Spaces:
Running
Running
| # This module handles model inference | |
| import torch | |
| from transformers import AutoProcessor, AutoModelForCTC | |
# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
# set espeak library path for macOS: phonemizer cannot always locate the
# Homebrew espeak dylib on its own, so point it at the library explicitly.
import glob
import os
import sys

if sys.platform == "darwin":
    from phonemizer.backend.espeak.wrapper import EspeakWrapper

    # Prefer an explicit override, then fall back to ANY Homebrew-installed
    # espeak/espeak-ng version instead of one hard-coded Cellar path
    # (the old fixed "1.48.04_1" path broke on every other espeak release).
    _candidates = [os.environ.get("PHONEMIZER_ESPEAK_LIBRARY")] + sorted(
        glob.glob("/opt/homebrew/Cellar/espeak*/*/lib/libespeak*.dylib"),
        reverse=True,  # highest version first
    )
    for _lib in _candidates:
        if _lib and os.path.exists(_lib):
            EspeakWrapper.set_library(_lib)
            break
def clear_cache():
    """Release cached accelerator memory on whichever backend is present.

    No-op on CPU-only machines; safe to call at any time.
    """
    cuda_ready = torch.cuda.is_available()
    mps_ready = torch.backends.mps.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    if mps_ready:
        torch.mps.empty_cache()
def load_model(model_id, device=DEVICE):
    """Download a CTC model and its processor; move the model to *device*.

    Returns a ``(model, processor)`` tuple.
    """
    audio_processor = AutoProcessor.from_pretrained(model_id)
    ctc_model = AutoModelForCTC.from_pretrained(model_id)
    return ctc_model.to(device), audio_processor
def transcribe(audio, model, processor) -> str:
    """Greedy CTC decode of a single audio clip; returns the transcript text."""
    # Feature-extract the clip at the processor's native sampling rate.
    features = processor(
        [audio],
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    inputs = features.input_values.type(torch.float32).to(model.device)
    with torch.no_grad():
        logits = model(inputs).logits
    # Greedy decoding: take the most likely token at each frame.
    best_ids = logits.argmax(dim=-1)
    return processor.decode(best_ids[0])