Upload processing_granite_speech.py
processing_granite_speech.py  +96 -1
@@ -33,8 +33,103 @@ logger = logging.get_logger(__name__)
 # 🚨🚨🚨 HACK 🚨🚨🚨
 # This is needed to avoid custom registration issues for now,
 # since we have a custom subclass for the feature extractor as well.
+import math
+from typing import List, Optional
+
+from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+from transformers.utils import is_torch_available, is_torchaudio_available, logging
+
+if is_torch_available():
+    import torch
+
+if is_torchaudio_available():
+    import torchaudio
+
+
+class GraniteSpeechFeatureExtractor(FeatureExtractionMixin):
+    model_input_names = ["input_features"]
+
+    def __init__(
+        self,
+        sampling_rate=16000,
+        n_fft=512,
+        win_length=400,
+        hop_length=160,
+        n_mels=80,
+        projector_window_size=15,
+        projector_downsample_rate=5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.melspec_kwargs = {
+            "sample_rate": sampling_rate,
+            "n_fft": n_fft,
+            "win_length": win_length,
+            "hop_length": hop_length,
+            "n_mels": n_mels,
+        }
+        # HACK - for now, lazily initialize the mel spectrogram transform;
+        # the feature extractor mixin explodes otherwise because
+        # it tries to log the feature extractor, and the melspectrogram
+        # transform isn't json serializable...
+        self.melspec = None
+        self.projector_window_size = projector_window_size
+        self.projector_downsample_rate = projector_downsample_rate
+
+    def _ensure_melspec_transform_is_initialized(self):
+        if self.melspec is None:
+            self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs)
+
+    def __call__(
+        self,
+        x: torch.Tensor,
+        device: Optional[str] = "cpu",
+    ) -> BatchFeature:
+        # TODO there is probably a better way to do both of these things...
+        self._ensure_melspec_transform_is_initialized()
+        if device is not None:
+            melspec = self.melspec.to(device)
+            x = x.to(device)
+        else:
+            melspec = self.melspec
+
+        B, _ = x.shape
+        with torch.no_grad():
+            mel = melspec(x.float())
+            logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_()
+            mx = logmel.amax(dim=(-2, -1), keepdim=True)
+            logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1)
+            if logmel.shape[1] % 2 == 1:
+                logmel = logmel[:, :-1]  # remove last frame if odd
+            x = logmel.reshape(B, -1, 2 * logmel.shape[-1])  # stacking and skipping by 2
+
+        if x.device != "cpu":
+            return x.detach().cpu()
+        return x
+
+    def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]:
+        """
+        Gets the (variable length) number of features
+        (i.e., projector output) for the sequences being considered.
+        """
+        hop_length = self.melspec_kwargs["hop_length"]
+        effective_window_size = self.projector_window_size // self.projector_downsample_rate
+
+        projector_lengths = []
+        for raw_length in audio_lengths:
+            # mel sequence length computation
+            mel_length = raw_length // hop_length + 1
+            # encoder frame takes two mel features
+            encoder_length = mel_length // 2
+            nblocks = math.ceil(encoder_length / self.projector_window_size)
+            # projector output length
+            projector_length = nblocks * effective_window_size
+            projector_lengths.append(projector_length)
+
+        return projector_lengths
+
+
 import transformers
-from .feature_extraction_granite_speech import GraniteSpeechFeatureExtractor
 transformers.GraniteSpeechFeatureExtractor = GraniteSpeechFeatureExtractor
 # The above code is the only change in the modeling code from the following
 # commit on Alex's fork: 397e03a4d76c5f3d8a651e47ade9f27c635e1617
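A minimal usage sketch (illustrative, not part of the commit) to sanity-check the shapes and the length bookkeeping. It assumes torch and torchaudio are installed and that the class above is importable; the import path here is hypothetical, since the file ships as remote code on the Hub:

import torch

# Hypothetical import path; in practice the class is also exposed via the
# module-level assignment onto `transformers` at the end of the diff.
from processing_granite_speech import GraniteSpeechFeatureExtractor

extractor = GraniteSpeechFeatureExtractor()  # defaults: 16 kHz, hop_length=160, n_mels=80

# Two seconds of dummy 16 kHz audio, batch of one.
wav = torch.randn(1, 32000)
feats = extractor(wav)

# 32000 // 160 + 1 = 201 mel frames; the odd trailing frame is dropped and
# adjacent frames are stacked in pairs, giving (1, 100, 2 * 80) = (1, 100, 160).
print(feats.shape)

# Projector length bookkeeping: 201 // 2 = 100 encoder frames,
# ceil(100 / 15) = 7 blocks, 15 // 5 = 3 features per block -> [21].
print(extractor._get_num_audio_features([32000]))

Per the HACK comments in the diff, the assignment onto `transformers` stands in for proper custom-class registration while the feature extractor subclass lives alongside the processor; the sketch above sidesteps that by importing the class directly.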
|