update generate method of mixin
model.py
@@ -1801,17 +1801,61 @@ class WhisperModel(WhisperPreTrainedModel):
             encoder_attentions=encoder_outputs.attentions,
         )

-
+def _pad_to_max_length(
+    current_segments,
+    pad_token_id,
+    device,
+    padding_side="right",
+    padding="longest",
+    bos_token_tensor=None,
+    cut_off_length=None,
+):
+    max_total_length = 0
+    sequences = []
+
+    if padding_side not in ["right", "left"]:
+        raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}")
+
+    if padding not in ["longest", "max_length"]:
+        raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}")
+    elif padding == "max_length" and cut_off_length is None:
+        raise ValueError("`cut_off_length` must be specified when `padding='max_length'`")
+
+    for current_segment_list in current_segments:
+        if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0:
+            sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1)
+
+            if cut_off_length is not None:
+                sequence = sequence[-cut_off_length:]
+
+            if bos_token_tensor is not None:
+                sequence = torch.cat([bos_token_tensor, sequence])
+
+            sequences.append(sequence)
+            max_total_length = max(max_total_length, len(sequences[-1]))
+        elif bos_token_tensor is not None:
+            sequences.append(bos_token_tensor)
+        else:
+            sequences.append(torch.tensor([], device=device))
+
+    max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length
+    for i in range(len(current_segments)):
+        pad_length = max_total_length - len(sequences[i])
+        pad = (0, pad_length) if padding_side == "right" else (pad_length, 0)
+        sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id)
+
+    sequences = torch.stack(sequences, dim=0)
+    return sequences

 # CUSTOM (patch the generation method)
 class CustomWhisperGenerationMixin(WhisperGenerationMixin):
     def generate(
         self,
         input_features: Optional[torch.Tensor] = None,
-        generation_config: Optional[
+        generation_config: Optional[GenerationConfig] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[
-        prefix_allowed_tokens_fn: Optional[
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
         synced_gpus: bool = False,
         return_timestamps: Optional[bool] = None,
         task: Optional[str] = None,
@@ -1827,11 +1871,9 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
         num_segment_frames: Optional[int] = None,
         attention_mask: Optional[torch.Tensor] = None,
         time_precision: float = 0.02,
-        time_precision_features: float = 0.01,
         return_token_timestamps: Optional[bool] = None,
         return_segments: bool = False,
         return_dict_in_generate: Optional[bool] = None,
-        force_unique_generate_call: Optional[bool] = None,
         **kwargs,
     ):
         # 0. deprecate old inputs
@@ -1846,7 +1888,7 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)

         # 2. set global generate variables
-        input_stride = self.model.encoder.
+        input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
         num_segment_frames = input_stride * self.config.max_source_positions
         batch_size, total_input_frames = self._retrieve_total_input_frames(
             input_features=input_features, input_stride=input_stride, kwargs=kwargs
@@ -1898,21 +1940,11 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
         # 3. Retrieve logits processors
         device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
         begin_index = init_tokens.shape[1]
-        num_beams = kwargs.get(
-            "num_beams",
-            generation_config.num_beams
-            if hasattr(generation_config, "num_beams") and generation_config.num_beams is not None
-            else 1,
-        )
-        if "assistant_model" in kwargs:
-            # speculative decoding: the model should be able to return eos token
-            generation_config.begin_suppress_tokens = None
-
         logits_processor = self._retrieve_logit_processors(
             generation_config=generation_config,
             logits_processor=logits_processor,
             begin_index=begin_index, # begin index is index of first generated decoder token
-            num_beams=num_beams,
+            num_beams=kwargs.get("num_beams", 1),
             device=device,
         )

@@ -1956,19 +1988,6 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
             batch_size=cur_bsz,
             generation_config=generation_config,
         )
-        # 5bis speculative decoding: ensure the assistant model does only one call to generate and therefore returns decoder input token ids and eos token id
-        # we set a flag in the generation config to force the model to make only one call to generate and return the decoder input token ids and eos token id
-        if "assistant_model" in kwargs:
-            assistant_model = kwargs["assistant_model"]
-            assistant_model.generation_config.force_unique_generate_call = True
-
-        if force_unique_generate_call is None:
-            if hasattr(generation_config, "force_unique_generate_call"):
-                force_unique_generate_call = generation_config.force_unique_generate_call
-            elif hasattr(self.generation_config, "force_unique_generate_call"):
-                force_unique_generate_call = self.generation_config.force_unique_generate_call
-            else:
-                force_unique_generate_call = False

         # 6 Transcribe audio until we reach the end of all input audios
         while (seek < max_frames).any():
@@ -1983,9 +2002,7 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
                 cur_bsz=cur_bsz,
                 batch_idx_map=batch_idx_map,
             )
-            time_offset = (
-                seek.to(torch.float32 if device.type == "mps" else torch.float64) * time_precision / input_stride
-            )
+            time_offset = seek * time_precision / input_stride
             seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)

             # 6.2 cut out next 30s segment from input features
@@ -1997,13 +2014,6 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
                 cur_bsz=cur_bsz,
                 batch_idx_map=batch_idx_map,
             )
-
-            def _get_attr_from_logit_processors(logits_processor, logit_processor_class, attribute_name):
-                if logits_processor is not None:
-                    logit_processor = next((cls for cls in logits_processor if isinstance(cls, logit_processor_class)), None)
-                    if logit_processor:
-                        return getattr(logit_processor, attribute_name, None)
-                return None

             # 6.3 prepare decoder input ids
             suppress_tokens = _get_attr_from_logit_processors(
@@ -2021,7 +2031,6 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
                 config=self.config,
                 device=init_tokens.device,
                 suppress_tokens=suppress_tokens,
-                timestamp_begin=timestamp_begin,
                 kwargs=kwargs,
             )

@@ -2082,20 +2091,18 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
                     timestamp_begin=timestamp_begin,
                     seek_num_frames=seek_num_frames,
                     time_precision=time_precision,
-                    time_precision_features=time_precision_features,
                     input_stride=input_stride,
                     prev_idx=prev_i,
                     idx=i,
                     return_token_timestamps=return_token_timestamps,
-                    decoder_input_ids=decoder_input_ids,
                 )

-                seek[prev_i] += segment_offset
-
                 current_segments[prev_i] += segments

-
-
+                if is_shortform:
+                    seek[prev_i] += max_frames[i]
+                else:
+                    seek[prev_i] += segment_offset

         # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
         # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
@@ -2105,154 +2112,51 @@ class CustomWhisperGenerationMixin(WhisperGenerationMixin):
             else current_segments
         )

-
-
-
-        if (
-            return_dict_in_generate
-            and generation_config.return_dict_in_generate
-            and (force_unique_generate_call or not return_timestamps)
-        ):
-            # only one call to generate_with_fallback, we can return a ModelOutput
-            outputs = self._stack_split_outputs(seek_outputs, model_output_type, self.device, kwargs)
-            if num_return_sequences > 1:
-                if hasattr(outputs, "encoder_attentions") and outputs.encoder_attentions is not None:
-                    outputs.encoder_attentions = tuple(
-                        outputs.encoder_attentions[i][::num_return_sequences]
-                        for i in range(len(outputs.encoder_attentions))
-                    )
-                if hasattr(outputs, "encoder_hidden_states") and outputs.encoder_hidden_states is not None:
-                    outputs.encoder_hidden_states = tuple(
-                        outputs.encoder_hidden_states[i][::num_return_sequences]
-                        for i in range(len(outputs.encoder_hidden_states))
-                    )
-            return outputs
-
-    def _pad_to_max_length(
-        current_segments,
-        pad_token_id,
-        device,
-        padding_side="right",
-        padding="longest",
-        bos_token_tensor=None,
-        cut_off_length=None,
-        return_token_timestamps=False,
-        force_unique_generate_call=False,
-    ):
-        max_total_length = 0
-        sequences = []
-        token_timestamps_list = []
-
-        if padding_side not in ["right", "left"]:
-            raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}")
-
-        if padding not in ["longest", "max_length"]:
-            raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}")
-        elif padding == "max_length" and cut_off_length is None:
-            raise ValueError("`cut_off_length` must be specified when `padding='max_length'`")
-
-        if force_unique_generate_call:
-            sequences_list = []
-            timestamps_list = []
-            for segments in current_segments:
-                result = segments[0]["result"]
-                sequences_list.append(result if isinstance(result, torch.Tensor) else result["sequences"])
-                if return_token_timestamps:
-                    timestamps_list.append(result["token_timestamps"])
-
-            sequences = torch.stack(sequences_list, dim=0)
-            if return_token_timestamps:
-                token_timestamps = torch.stack(timestamps_list, dim=0)
-                return sequences, token_timestamps
-            return sequences
-
-        for current_segment_list in current_segments:
-            if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0:
-                sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1)
-                if return_token_timestamps:
-                    token_timestamps = torch.cat(
-                        [d["result"]["token_timestamps"][d["idxs"][0] : d["idxs"][1]] for d in current_segment_list],
-                        dim=-1,
-                    )
-
-                if cut_off_length is not None:
-                    sequence = sequence[-cut_off_length:]
-                    if return_token_timestamps:
-                        token_timestamps = token_timestamps[-cut_off_length:]
-
-                if bos_token_tensor is not None:
-                    sequence = torch.cat([bos_token_tensor, sequence])
-                    if return_token_timestamps:
-                        token_timestamps = torch.cat(
-                            [torch.ones_like(bos_token_tensor, device=device) * 0.0, token_timestamps]
-                        )
-                sequences.append(sequence)
-                if return_token_timestamps:
-                    token_timestamps_list.append(token_timestamps)
-                max_total_length = max(max_total_length, len(sequences[-1]))
-            elif bos_token_tensor is not None:
-                sequences.append(bos_token_tensor)
-                if return_token_timestamps:
-                    token_timestamps_list.append(torch.ones_like(bos_token_tensor, device=device) * 0.0)
-            else:
-                sequences.append(torch.tensor([], device=device))
-                if return_token_timestamps:
-                    token_timestamps_list.append(torch.tensor([], device=device))
-
-        max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length
-        for i in range(len(current_segments)):
-            pad_length = max_total_length - len(sequences[i])
-            pad = (0, pad_length) if padding_side == "right" else (pad_length, 0)
+        sequences = _pad_to_max_length(
+            final_segments, generation_config.pad_token_id, device=self.device, padding_side="right"
+        )

-            sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id)
-            if return_token_timestamps:
-                token_timestamps_list[i] = F.pad(
-                    token_timestamps_list[i],
-                    pad=pad,
-                    value=token_timestamps_list[i][-1] if len(token_timestamps_list[i]) > 0 else 0.0,
-                )
+        # 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
+        if return_segments:
+            return {"sequences": sequences, "segments": final_segments}

-        sequences = torch.stack(sequences, dim=0)
+        if is_shortform:
+            # add eos token:
+            if generation_config.max_new_tokens is None and generation_config.max_length is None:
+                eos_tokens = torch.full((sequences.shape[0], 1), generation_config.eos_token_id)
+                sequences = torch.cat([sequences, eos_tokens], dim=-1)

         if return_token_timestamps:
-            token_timestamps = torch.stack(token_timestamps_list, dim=0)
-            return sequences, token_timestamps
+            outputs = {}
+            outputs["sequences"] = sequences
+            outputs["token_timestamps"] = torch.stack([d["token_timestamps"] for d in seek_outputs], dim=0)
         else:
-            return sequences
+            outputs = sequences

-        padded_outputs = _pad_to_max_length(
-            current_segments=final_segments,
-            pad_token_id=generation_config.pad_token_id,
-            device=self.device,
-            padding_side="right",
-            return_token_timestamps=return_token_timestamps,
-            force_unique_generate_call=force_unique_generate_call,
-        )
+        if return_dict_in_generate and generation_config.return_dict_in_generate:
+            dict_outputs = self._stack_split_outputs(seek_outputs, model_output_type, sequences.device, kwargs)

-        if return_dict_in_generate and generation_config.return_dict_in_generate:
-            logger.warning_once(
-                "You have passed `return_dict_in_generate=True` and `return_timestamps=True`, this automatically sets `return_segments=True` to access the resuls of the underlying calls to GenerationMixin's generate in the returned `segments`."
-            )
-            return_segments = True
-        elif not return_segments and not return_token_timestamps:
-            return padded_outputs
-
-        if return_token_timestamps:
-            sequences, token_timestamps = padded_outputs
-            outputs = {
-                "sequences": sequences,
-                "token_timestamps": token_timestamps,
-            }
-        else:
-            sequences = padded_outputs
-            outputs = {
-                "sequences": sequences,
-            }
+            if num_return_sequences > 1:
+                if hasattr(dict_outputs, "encoder_attentions") and dict_outputs.encoder_attentions is not None:
+                    dict_outputs.encoder_attentions = tuple(
+                        dict_outputs.encoder_attentions[i][::num_return_sequences]
+                        for i in range(len(dict_outputs.encoder_attentions))
+                    )
+                if (
+                    hasattr(dict_outputs, "encoder_hidden_states")
+                    and dict_outputs.encoder_hidden_states is not None
+                ):
+                    dict_outputs.encoder_hidden_states = tuple(
+                        dict_outputs.encoder_hidden_states[i][::num_return_sequences]
+                        for i in range(len(dict_outputs.encoder_hidden_states))
+                    )
+            if return_token_timestamps:
+                dict_outputs["token_timestamps"] = outputs["token_timestamps"]
+            return dict_outputs

-        if return_segments:
-            outputs["segments"] = final_segments
+        return outputs

-        return outputs
+        return sequences

 @add_start_docstrings(
     "The Whisper Model with a language modeling head. Can be used for automatic speech recognition.",
@@ -2270,132 +2174,6 @@ class CustomWhisperForConditionalGeneration(CustomWhisperGenerationMixin, Whispe

         # Initialize weights and apply final processing
         self.post_init()
-
-    def _pad_to_max_length(
-        current_segments,
-        pad_token_id,
-        device,
-        padding_side="right",
-        padding="longest",
-        bos_token_tensor=None,
-        cut_off_length=None,
-        return_token_timestamps=False,
-        force_unique_generate_call=False,
-    ):
-        max_total_length = 0
-        sequences = []
-        token_timestamps_list = []
-
-        if padding_side not in ["right", "left"]:
-            raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}")
-
-        if padding not in ["longest", "max_length"]:
-            raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}")
-        elif padding == "max_length" and cut_off_length is None:
-            raise ValueError("`cut_off_length` must be specified when `padding='max_length'`")
-
-        if force_unique_generate_call:
-            sequences_list = []
-            timestamps_list = []
-            for segments in current_segments:
-                result = segments[0]["result"]
-                sequences_list.append(result if isinstance(result, torch.Tensor) else result["sequences"])
-                if return_token_timestamps:
-                    timestamps_list.append(result["token_timestamps"])
-
-            sequences = torch.stack(sequences_list, dim=0)
-            if return_token_timestamps:
-                token_timestamps = torch.stack(timestamps_list, dim=0)
-                return sequences, token_timestamps
-            return sequences
-
-        for current_segment_list in current_segments:
-            if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0:
-                sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1)
-                if return_token_timestamps:
-                    token_timestamps = torch.cat(
-                        [d["result"]["token_timestamps"][d["idxs"][0] : d["idxs"][1]] for d in current_segment_list],
-                        dim=-1,
-                    )
-
-                if cut_off_length is not None:
-                    sequence = sequence[-cut_off_length:]
-                    if return_token_timestamps:
-                        token_timestamps = token_timestamps[-cut_off_length:]
-
-                if bos_token_tensor is not None:
-                    sequence = torch.cat([bos_token_tensor, sequence])
-                    if return_token_timestamps:
-                        token_timestamps = torch.cat(
-                            [torch.ones_like(bos_token_tensor, device=device) * 0.0, token_timestamps]
-                        )
-                sequences.append(sequence)
-                if return_token_timestamps:
-                    token_timestamps_list.append(token_timestamps)
-                max_total_length = max(max_total_length, len(sequences[-1]))
-            elif bos_token_tensor is not None:
-                sequences.append(bos_token_tensor)
-                if return_token_timestamps:
-                    token_timestamps_list.append(torch.ones_like(bos_token_tensor, device=device) * 0.0)
-            else:
-                sequences.append(torch.tensor([], device=device))
-                if return_token_timestamps:
-                    token_timestamps_list.append(torch.tensor([], device=device))
-
-        max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length
-        for i in range(len(current_segments)):
-            pad_length = max_total_length - len(sequences[i])
-            pad = (0, pad_length) if padding_side == "right" else (pad_length, 0)
-
-            sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id)
-            if return_token_timestamps:
-                token_timestamps_list[i] = F.pad(
-                    token_timestamps_list[i],
-                    pad=pad,
-                    value=token_timestamps_list[i][-1] if len(token_timestamps_list[i]) > 0 else 0.0,
-                )
-
-        sequences = torch.stack(sequences, dim=0)
-
-        if return_token_timestamps:
-            token_timestamps = torch.stack(token_timestamps_list, dim=0)
-            return sequences, token_timestamps
-        else:
-            return sequences
-
-        padded_outputs = _pad_to_max_length(
-            current_segments=final_segments,
-            pad_token_id=generation_config.pad_token_id,
-            device=self.device,
-            padding_side="right",
-            return_token_timestamps=return_token_timestamps,
-            force_unique_generate_call=force_unique_generate_call,
-        )
-
-        if return_dict_in_generate and generation_config.return_dict_in_generate:
-            logger.warning_once(
-                "You have passed `return_dict_in_generate=True` and `return_timestamps=True`, this automatically sets `return_segments=True` to access the resuls of the underlying calls to GenerationMixin's generate in the returned `segments`."
-            )
-            return_segments = True
-        elif not return_segments and not return_token_timestamps:
-            return padded_outputs
-
-        if return_token_timestamps:
-            sequences, token_timestamps = padded_outputs
-            outputs = {
-                "sequences": sequences,
-                "token_timestamps": token_timestamps,
-            }
-        else:
-            sequences = padded_outputs
-            outputs = {
-                "sequences": sequences,
-            }
-
-        if return_segments:
-            outputs["segments"] = final_segments
-
-        return outputs

     def get_encoder(self):
         return self.model.get_encoder()