christopher-hoernle committed
Commit 51ea6ce · 1 Parent(s): 793fd0d
Files changed (1)
  1. model.py +459 -59
model.py CHANGED
@@ -1,6 +1,11 @@
  from transformers.models.whisper.configuration_whisper import WhisperConfig
- from typing import List, Literal, Optional, Dict, Any
- import types
+ import torch.nn.functional as F
+ from transformers.generation.logits_process import (
+     LogitsProcessorList,
+     SuppressTokensLogitsProcessor
+ )
+ from typing import List, Optional, Dict, Any
+ import warnings

  """Custom config to support modification of the Whisper encoder."""
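For orientation, a minimal sketch (not part of the commit) of how the newly imported LogitsProcessorList and SuppressTokensLogitsProcessor fit together; it mirrors the suppress-token lookup that the new generate() performs further down in this diff. The token id 50256 is only an illustrative value.

# Illustrative only; mirrors the `_get_attr_from_logit_processors` helper added below.
from transformers.generation.logits_process import (
    LogitsProcessorList,
    SuppressTokensLogitsProcessor,
)

processors = LogitsProcessorList([SuppressTokensLogitsProcessor([50256])])  # 50256 is an arbitrary example id
suppress_tokens = next(
    (getattr(p, "suppress_tokens", None) for p in processors if isinstance(p, SuppressTokensLogitsProcessor)),
    None,
)
print(suppress_tokens)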
 
@@ -1152,8 +1157,7 @@ class WhisperEncoder(WhisperPreTrainedModel):
          )

          inputs_embeds = inputs_embeds.permute(0, 2, 1)
-         embed_pos = self.embed_positions.weight
-
+
          sequence_length = hidden_states.shape[1]

          # CUSTOM
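The removed line loaded the full, fixed-size positional-embedding table. Below is a standalone sketch of the likely replacement pattern; this is an assumption, since the body of the "# CUSTOM" block is not shown in this hunk. The idea is to slice the table to whatever sequence length the convolutional stack actually produced.

# Assumption, for illustration only: slice a fixed position table to a dynamic length.
import torch

max_source_positions, d_model = 1500, 8
embed_positions = torch.randn(max_source_positions, d_model)  # stands in for self.embed_positions.weight

hidden_states = torch.randn(2, 750, d_model)  # a deeper conv stack can yield fewer than 1500 positions
sequence_length = hidden_states.shape[1]
hidden_states = hidden_states + embed_positions[:sequence_length]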
@@ -1811,69 +1815,465 @@ class CustomWhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTr
          self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False)
          self.max_target_positions = config.max_target_positions

-         self.patch_generate()
-
          # Initialize weights and apply final processing
          self.post_init()

-     # CUSTOM (Monkeypatch the generation method)
-     def patch_generate(self):
-         """
-         Monkey patches the WhisperGenerationMixin to use dynamic stride calculation
-         """
-         original_generate = WhisperGenerationMixin.generate
-
-         def get_conv_stride(self):
-             """Calculate total stride of all conv layers"""
-             total_stride = 1
-             for layer in self.model.encoder.conv_layers:
-                 total_stride *= layer.stride[0]
-             return total_stride

-         def generate_wrapper(self, *args, **kwargs):
-             # Store the original function logic
-             original_code = original_generate.__code__
-
-             # Create a modified version of the function that uses our stride calculation
-             modified_code = types.CodeType(
-                 original_code.co_argcount,
-                 original_code.co_posonlyargcount,
-                 original_code.co_kwonlyargcount,
-                 original_code.co_nlocals,
-                 original_code.co_stacksize,
-                 original_code.co_flags,
-                 original_code.co_code.replace(
-                     # Replace the hardcoded stride calculation with our dynamic one
-                     b"self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]",
-                     b"self.get_conv_stride()",
-                 ),
-                 original_code.co_consts,
-                 original_code.co_names,
-                 original_code.co_varnames,
-                 original_code.co_filename,
-                 original_code.co_name,
-                 original_code.co_firstlineno,
-                 original_code.co_lnotab,
-                 original_code.co_freevars,
-                 original_code.co_cellvars,
              )
-
-             # Create a new function with the modified code
-             new_generate = types.FunctionType(
-                 modified_code,
-                 original_generate.__globals__,
-                 original_generate.__name__,
-                 original_generate.__defaults__,
-                 original_generate.__closure__,
              )

-             # Bind the function to the instance and call it
-             return new_generate(self, *args, **kwargs)

-         # Add the stride calculation method to the mixin
-         WhisperGenerationMixin.get_conv_stride = get_conv_stride
-         # Replace the original generate method
-         WhisperGenerationMixin.generate = generate_wrapper
 
 
 
+     # CUSTOM (patch the generation method)
+     def get_conv_stride(self):
+         """Calculate total stride of all conv layers"""
+         total_stride = 1
+         for layer in self.model.encoder.conv_layers:
+             total_stride *= layer.stride[0]
+         return total_stride

+     def generate(
+         self,
+         input_features: Optional[torch.Tensor] = None,
+         generation_config: Optional[Any] = None,
+         logits_processor: Optional[LogitsProcessorList] = None,
+         stopping_criteria: Optional[Any] = None,
+         prefix_allowed_tokens_fn: Optional[Any] = None,
+         synced_gpus: bool = False,
+         return_timestamps: Optional[bool] = None,
+         task: Optional[str] = None,
+         language: Optional[Union[str, List[str]]] = None,
+         is_multilingual: Optional[bool] = None,
+         prompt_ids: Optional[torch.Tensor] = None,
+         prompt_condition_type: Optional[str] = None,  # first-segment, all-segments
+         condition_on_prev_tokens: Optional[bool] = None,
+         temperature: Optional[Union[float, Tuple[float, ...]]] = None,
+         compression_ratio_threshold: Optional[float] = None,
+         logprob_threshold: Optional[float] = None,
+         no_speech_threshold: Optional[float] = None,
+         num_segment_frames: Optional[int] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         time_precision: float = 0.02,
+         time_precision_features: float = 0.01,
+         return_token_timestamps: Optional[bool] = None,
+         return_segments: bool = False,
+         return_dict_in_generate: Optional[bool] = None,
+         force_unique_generate_call: Optional[bool] = None,
+         **kwargs,
+     ):
+         # 0. deprecate old inputs
+         if "inputs" in kwargs:
+             input_features = kwargs.pop("inputs")
+             warnings.warn(
+                 "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
+                 FutureWarning,
              )
+
+         # 1. prepare generation config
+         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
+
+         # 2. set global generate variables
+         input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
+         num_segment_frames = input_stride * self.config.max_source_positions
+         batch_size, total_input_frames = self._retrieve_total_input_frames(
+             input_features=input_features, input_stride=input_stride, kwargs=kwargs
+         )
+         is_shortform = total_input_frames <= num_segment_frames
+
+         # 3. Make sure generation config is correctly set
+         # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
+         return_dict_in_generate = self._set_return_outputs(
+             return_dict_in_generate=return_dict_in_generate,
+             return_token_timestamps=return_token_timestamps,
+             logprob_threshold=logprob_threshold,
+             generation_config=generation_config,
+         )
+         timestamp_begin = self._set_return_timestamps(
+             return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
+         )
+         self._set_language_and_task(
+             language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
+         )
+         self._set_num_frames(
+             return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
+         )
+         self._set_thresholds_and_condition(
+             generation_config=generation_config,
+             logprob_threshold=logprob_threshold,
+             compression_ratio_threshold=compression_ratio_threshold,
+             no_speech_threshold=no_speech_threshold,
+             condition_on_prev_tokens=condition_on_prev_tokens,
+         )
+         self._set_prompt_condition_type(
+             generation_config=generation_config,
+             prompt_condition_type=prompt_condition_type,
+         )
+
+         # pass self.config for backward compatibility
+         init_tokens = self._retrieve_init_tokens(
+             input_features,
+             batch_size=batch_size,
+             generation_config=generation_config,
+             config=self.config,
+             num_segment_frames=num_segment_frames,
+             kwargs=kwargs,
+         )
+         # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
+         # where the input ids are handled explicitly by the generate method
+         self._check_decoder_input_ids(kwargs=kwargs)
+
+         # 3. Retrieve logits processors
+         device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
+         begin_index = init_tokens.shape[1]
+         num_beams = kwargs.get(
+             "num_beams",
+             generation_config.num_beams
+             if hasattr(generation_config, "num_beams") and generation_config.num_beams is not None
+             else 1,
+         )
+         if "assistant_model" in kwargs:
+             # speculative decoding: the model should be able to return eos token
+             generation_config.begin_suppress_tokens = None
+
+         logits_processor = self._retrieve_logit_processors(
+             generation_config=generation_config,
+             logits_processor=logits_processor,
+             begin_index=begin_index,  # begin index is index of first generated decoder token
+             num_beams=num_beams,
+             device=device,
+         )
+
+         # 4 Set and retrieve global generation variables
+         self._set_condition_on_prev_tokens(
+             condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
+         )
+
+         temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
+         temperature = temperatures[0]
+
+         max_frames, seek = self._retrieve_max_frames_and_seek(
+             batch_size=batch_size,
+             attention_mask=attention_mask,
+             total_input_frames=total_input_frames,
+             is_shortform=is_shortform,
+         )
+
+         # 5 Prepare running variables, list for generation
+         num_return_sequences = generation_config.num_return_sequences
+         (
+             batch_idx_map,
+             cur_bsz,
+             input_features,
+             seek,
+             max_frames,
+             init_tokens,
+             do_condition_on_prev_tokens,
+         ) = self._expand_variables_for_generation(
+             input_features=input_features,
+             seek=seek,
+             max_frames=max_frames,
+             init_tokens=init_tokens,
+             batch_size=batch_size,
+             condition_on_prev_tokens=condition_on_prev_tokens,
+             generation_config=generation_config,
+         )
+
+         current_segments = self._prepare_segments(
+             prompt_ids=prompt_ids,
+             batch_size=cur_bsz,
+             generation_config=generation_config,
+         )
+         # 5bis speculative decoding: ensure the assistant model does only one call to generate and therefore returns decoder input token ids and eos token id
+         # we set a flag in the generation config to force the model to make only one call to generate and return the decoder input token ids and eos token id
+         if "assistant_model" in kwargs:
+             assistant_model = kwargs["assistant_model"]
+             assistant_model.generation_config.force_unique_generate_call = True
+
+         if force_unique_generate_call is None:
+             if hasattr(generation_config, "force_unique_generate_call"):
+                 force_unique_generate_call = generation_config.force_unique_generate_call
+             elif hasattr(self.generation_config, "force_unique_generate_call"):
+                 force_unique_generate_call = self.generation_config.force_unique_generate_call
+             else:
+                 force_unique_generate_call = False
+
+         # 6 Transcribe audio until we reach the end of all input audios
+         while (seek < max_frames).any():
+             # 6.1 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
+             # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
+             # to know which original audio is being decoded
+             # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
+             input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch(
+                 input_features=input_features,
+                 seek=seek,
+                 max_frames=max_frames,
+                 cur_bsz=cur_bsz,
+                 batch_idx_map=batch_idx_map,
+             )
+             time_offset = (
+                 seek.to(torch.float32 if device.type == "mps" else torch.float64) * time_precision / input_stride
+             )
+             seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
+
+             # 6.2 cut out next 30s segment from input features
+             segment_input = self._get_input_segment(
+                 input_features=input_features,
+                 seek=seek,
+                 seek_num_frames=seek_num_frames,
+                 num_segment_frames=num_segment_frames,
+                 cur_bsz=cur_bsz,
+                 batch_idx_map=batch_idx_map,
              )

+             def _get_attr_from_logit_processors(logits_processor, logit_processor_class, attribute_name):
+                 if logits_processor is not None:
+                     logit_processor = next((cls for cls in logits_processor if isinstance(cls, logit_processor_class)), None)
+                     if logit_processor:
+                         return getattr(logit_processor, attribute_name, None)
+                 return None
+
+             # 6.3 prepare decoder input ids
+             suppress_tokens = _get_attr_from_logit_processors(
+                 logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
+             )
+
+             decoder_input_ids, kwargs = self._prepare_decoder_input_ids(
+                 cur_bsz=cur_bsz,
+                 init_tokens=init_tokens,
+                 current_segments=current_segments,
+                 batch_idx_map=batch_idx_map,
+                 do_condition_on_prev_tokens=do_condition_on_prev_tokens,
+                 prompt_ids=prompt_ids,
+                 generation_config=generation_config,
+                 config=self.config,
+                 device=init_tokens.device,
+                 suppress_tokens=suppress_tokens,
+                 timestamp_begin=timestamp_begin,
+                 kwargs=kwargs,
+             )
+
+             # 6.4 set max new tokens or max length
+             self._set_max_new_tokens_and_length(
+                 config=self.config,
+                 decoder_input_ids=decoder_input_ids,
+                 generation_config=generation_config,
+             )
+
+             # 6.5 Set current `begin_index` for all logit processors
+             if logits_processor is not None:
+                 for proc in logits_processor:
+                     if hasattr(proc, "set_begin_index"):
+                         proc.set_begin_index(decoder_input_ids.shape[-1])
+
+             # 6.6 Run generate with fallback
+             (
+                 seek_sequences,
+                 seek_outputs,
+                 should_skip,
+                 do_condition_on_prev_tokens,
+                 model_output_type,
+             ) = self.generate_with_fallback(
+                 segment_input=segment_input,
+                 decoder_input_ids=decoder_input_ids,
+                 cur_bsz=cur_bsz,
+                 batch_idx_map=batch_idx_map,
+                 seek=seek,
+                 num_segment_frames=num_segment_frames,
+                 max_frames=max_frames,
+                 temperatures=temperatures,
+                 generation_config=generation_config,
+                 logits_processor=logits_processor,
+                 stopping_criteria=stopping_criteria,
+                 prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+                 synced_gpus=synced_gpus,
+                 return_token_timestamps=return_token_timestamps,
+                 do_condition_on_prev_tokens=do_condition_on_prev_tokens,
+                 is_shortform=is_shortform,
+                 batch_size=batch_size,
+                 attention_mask=attention_mask,
+                 kwargs=kwargs,
+             )
+
+             # 6.7 In every generated sequence, split by timestamp tokens and extract segments
+             for i, seek_sequence in enumerate(seek_sequences):
+                 prev_i = batch_idx_map[i]
+
+                 if should_skip[i]:
+                     seek[prev_i] += seek_num_frames[prev_i]
+                     continue
+
+                 segments, segment_offset = self._retrieve_segment(
+                     seek_sequence=seek_sequence,
+                     seek_outputs=seek_outputs,
+                     time_offset=time_offset,
+                     timestamp_begin=timestamp_begin,
+                     seek_num_frames=seek_num_frames,
+                     time_precision=time_precision,
+                     time_precision_features=time_precision_features,
+                     input_stride=input_stride,
+                     prev_idx=prev_i,
+                     idx=i,
+                     return_token_timestamps=return_token_timestamps,
+                     decoder_input_ids=decoder_input_ids,
+                 )
+
+                 seek[prev_i] += segment_offset
+
+                 current_segments[prev_i] += segments
+
+             if force_unique_generate_call:
+                 break
+
+         # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
+         # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
+         final_segments = (
+             [x[1:] for x in current_segments]
+             if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment")
+             else current_segments
+         )
+
+         # if return_dict_in_generate=True and we forced a unique call to generate or return_timestamps=False, meaning we are sure only one call to generate has been made,
+         # -> we can return a ModelOutput
+         # otherwise, return_dict_in_generate is applied in the 'result' of each segment in final_segments
+         if (
+             return_dict_in_generate
+             and generation_config.return_dict_in_generate
+             and (force_unique_generate_call or not return_timestamps)
+         ):
+             # only one call to generate_with_fallback, we can return a ModelOutput
+             outputs = self._stack_split_outputs(seek_outputs, model_output_type, self.device, kwargs)
+             if num_return_sequences > 1:
+                 if hasattr(outputs, "encoder_attentions") and outputs.encoder_attentions is not None:
+                     outputs.encoder_attentions = tuple(
+                         outputs.encoder_attentions[i][::num_return_sequences]
+                         for i in range(len(outputs.encoder_attentions))
+                     )
+                 if hasattr(outputs, "encoder_hidden_states") and outputs.encoder_hidden_states is not None:
+                     outputs.encoder_hidden_states = tuple(
+                         outputs.encoder_hidden_states[i][::num_return_sequences]
+                         for i in range(len(outputs.encoder_hidden_states))
+                     )
+             return outputs

+         def _pad_to_max_length(
+             current_segments,
+             pad_token_id,
+             device,
+             padding_side="right",
+             padding="longest",
+             bos_token_tensor=None,
+             cut_off_length=None,
+             return_token_timestamps=False,
+             force_unique_generate_call=False,
+         ):
+             max_total_length = 0
+             sequences = []
+             token_timestamps_list = []
+
+             if padding_side not in ["right", "left"]:
+                 raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}")
+
+             if padding not in ["longest", "max_length"]:
+                 raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}")
+             elif padding == "max_length" and cut_off_length is None:
+                 raise ValueError("`cut_off_length` must be specified when `padding='max_length'`")
+
+             if force_unique_generate_call:
+                 sequences_list = []
+                 timestamps_list = []
+                 for segments in current_segments:
+                     result = segments[0]["result"]
+                     sequences_list.append(result if isinstance(result, torch.Tensor) else result["sequences"])
+                     if return_token_timestamps:
+                         timestamps_list.append(result["token_timestamps"])
+
+                 sequences = torch.stack(sequences_list, dim=0)
+                 if return_token_timestamps:
+                     token_timestamps = torch.stack(timestamps_list, dim=0)
+                     return sequences, token_timestamps
+                 return sequences
+
+             for current_segment_list in current_segments:
+                 if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0:
+                     sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1)
+                     if return_token_timestamps:
+                         token_timestamps = torch.cat(
+                             [d["result"]["token_timestamps"][d["idxs"][0] : d["idxs"][1]] for d in current_segment_list],
+                             dim=-1,
+                         )
+
+                     if cut_off_length is not None:
+                         sequence = sequence[-cut_off_length:]
+                         if return_token_timestamps:
+                             token_timestamps = token_timestamps[-cut_off_length:]
+
+                     if bos_token_tensor is not None:
+                         sequence = torch.cat([bos_token_tensor, sequence])
+                         if return_token_timestamps:
+                             token_timestamps = torch.cat(
+                                 [torch.ones_like(bos_token_tensor, device=device) * 0.0, token_timestamps]
+                             )
+                     sequences.append(sequence)
+                     if return_token_timestamps:
+                         token_timestamps_list.append(token_timestamps)
+                     max_total_length = max(max_total_length, len(sequences[-1]))
+                 elif bos_token_tensor is not None:
+                     sequences.append(bos_token_tensor)
+                     if return_token_timestamps:
+                         token_timestamps_list.append(torch.ones_like(bos_token_tensor, device=device) * 0.0)
+                 else:
+                     sequences.append(torch.tensor([], device=device))
+                     if return_token_timestamps:
+                         token_timestamps_list.append(torch.tensor([], device=device))
+
+             max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length
+             for i in range(len(current_segments)):
+                 pad_length = max_total_length - len(sequences[i])
+                 pad = (0, pad_length) if padding_side == "right" else (pad_length, 0)
+
+                 sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id)
+                 if return_token_timestamps:
+                     token_timestamps_list[i] = F.pad(
+                         token_timestamps_list[i],
+                         pad=pad,
+                         value=token_timestamps_list[i][-1] if len(token_timestamps_list[i]) > 0 else 0.0,
+                     )
+
+             sequences = torch.stack(sequences, dim=0)
+
+             if return_token_timestamps:
+                 token_timestamps = torch.stack(token_timestamps_list, dim=0)
+                 return sequences, token_timestamps
+             else:
+                 return sequences
+
+         padded_outputs = _pad_to_max_length(
+             current_segments=final_segments,
+             pad_token_id=generation_config.pad_token_id,
+             device=self.device,
+             padding_side="right",
+             return_token_timestamps=return_token_timestamps,
+             force_unique_generate_call=force_unique_generate_call,
+         )
+
+         if return_dict_in_generate and generation_config.return_dict_in_generate:
+             logger.warning_once(
+                 "You have passed `return_dict_in_generate=True` and `return_timestamps=True`, this automatically sets `return_segments=True` to access the resuls of the underlying calls to GenerationMixin's generate in the returned `segments`."
+             )
+             return_segments = True
+         elif not return_segments and not return_token_timestamps:
+             return padded_outputs
+
+         if return_token_timestamps:
+             sequences, token_timestamps = padded_outputs
+             outputs = {
+                 "sequences": sequences,
+                 "token_timestamps": token_timestamps,
+             }
+         else:
+             sequences = padded_outputs
+             outputs = {
+                 "sequences": sequences,
+             }
+
+         if return_segments:
+             outputs["segments"] = final_segments
+
+         return outputs

      def get_encoder(self):
          return self.model.get_encoder()
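Taken together, the commit replaces the removed bytecode monkey patch with a direct override: get_conv_stride() multiplies the strides of every layer in the custom conv_layers stack, and the overridden generate() reproduces the upstream long-form transcription loop. Below is a hedged usage sketch; the checkpoint path, base processor, and dummy audio are assumptions for illustration, not taken from this repo.

# Usage sketch (illustrative only; paths and checkpoints are placeholders).
import numpy as np
from transformers import WhisperProcessor
from model import CustomWhisperForConditionalGeneration  # this repo's model.py

processor = WhisperProcessor.from_pretrained("openai/whisper-small")                 # assumed base checkpoint
model = CustomWhisperForConditionalGeneration.from_pretrained("path/to/checkpoint")  # placeholder path

# Dynamic total stride over all encoder conv layers; equals the hard-coded
# conv1.stride[0] * conv2.stride[0] (= 2) for the stock two-layer encoder.
input_stride = model.get_conv_stride()
num_segment_frames = input_stride * model.config.max_source_positions  # e.g. 2 * 1500 = 3000 mel frames, about 30 s

audio = np.zeros(16_000 * 45, dtype=np.float32)  # 45 s dummy waveform to exercise the long-form loop
inputs = processor(audio, sampling_rate=16_000, return_tensors="pt",
                   truncation=False, padding="longest")  # keep features longer than one 30 s segment

out = model.generate(
    inputs.input_features,
    return_timestamps=True,   # long-form mode: sequences are split on timestamp tokens
    return_segments=True,     # also return the per-segment dicts built in step 6.7
)
print(out["sequences"].shape, len(out["segments"][0]))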