"""Model inference utilities: device selection, CTC model loading, and transcription."""
# This module handles model inference
import os

import torch
from transformers import AutoProcessor, AutoModelForCTC
# Preferred compute device: CUDA first, then Apple-silicon MPS, else CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
# set espeak library path for macOS
import sys

if sys.platform == "darwin":
    from phonemizer.backend.espeak.wrapper import EspeakWrapper

    # The original pinned a Cellar path for espeak 1.48.04_1, which breaks as
    # soon as Homebrew installs any other version. Probe the stable "opt"
    # symlink first, then the legacy pinned path, and only call set_library
    # when the dylib actually exists so a missing file doesn't misconfigure
    # phonemizer (it can fall back to its own discovery otherwise).
    _ESPEAK_CANDIDATES = (
        "/opt/homebrew/opt/espeak/lib/libespeak.1.1.48.dylib",
        "/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib",
    )
    for _espeak_lib in _ESPEAK_CANDIDATES:
        if os.path.exists(_espeak_lib):
            EspeakWrapper.set_library(_espeak_lib)
            break
def clear_cache():
    """Release cached accelerator memory on whichever backends are present."""
    has_cuda = torch.cuda.is_available()
    has_mps = torch.backends.mps.is_available()
    if has_cuda:
        # Return unused cached blocks to the driver and reap stale IPC handles.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    if has_mps:
        torch.mps.empty_cache()
def load_model(model_id, device=DEVICE):
    """Download/load a CTC model and its processor, moving the model to *device*.

    Returns a ``(model, processor)`` tuple.
    """
    processor = AutoProcessor.from_pretrained(model_id)
    ctc_model = AutoModelForCTC.from_pretrained(model_id)
    return ctc_model.to(device), processor
def transcribe(audio, model, processor) -> str:
    """Greedy-decode a single audio clip with a CTC model and return the text."""
    # Batch of one: the processor pads/normalizes and emits a "pt" tensor batch.
    features = processor(
        [audio],
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    inputs = features.input_values.type(torch.float32).to(model.device)
    with torch.no_grad():
        logits = model(inputs).logits
    # Pick the most probable token per frame; processor.decode collapses the
    # CTC path (repeats + blanks) into the final string.
    best_path = logits.argmax(dim=-1)
    return processor.decode(best_path[0])