Spaces:
Sleeping
Sleeping
import torch | |
import torchaudio | |
from torchaudio.models.decoder import ctc_decoder, download_pretrained_files | |
class SpeechRecognitionEngine:
    """
    ASR engine to transcribe recorded audio.

    Holds a TorchScript acoustic model and a lexicon-constrained CTC beam
    search decoder backed by the pretrained "librispeech-4-gram" files.
    """

    def __init__(self, model_file, token_path):
        """
        Load the TorchScript model and build the beam search decoder.

        Args:
            model_file: Path (or file-like object) accepted by
                ``torch.jit.load``.
            token_path: Path to a text file with one token per line.
        """
        # map_location makes loading work on CPU-only hosts even when the
        # checkpoint was saved from a CUDA device.
        self.model = torch.jit.load(model_file, map_location='cpu')
        self.model.eval()

        # Load decoder files and tokens
        files = download_pretrained_files("librispeech-4-gram")
        # Explicit encoding so the token list does not depend on the locale.
        with open(token_path, 'r', encoding='utf-8') as f:
            tokens = f.read().splitlines()

        self.decoder = ctc_decoder(
            lexicon=files.lexicon,
            tokens=tokens,
            lm=files.lm,
            nbest=1,
            beam_size=50,
            beam_threshold=25,
            beam_size_token=20,
            lm_weight=1.23,
            word_score=-0.26,
        )
        print("Loaded beam search with Ken LM")

    def transcribe(self, model, featurizer, filename):
        """
        Transcribe audio from a file using the ASR model.

        Args:
            model: Callable applied to the features; its output is passed
                straight to the decoder. If ``None``, the engine's own
                ``self.model`` is used.
            featurizer: Callable applied to the loaded waveform. Its result
                must be a 3-D tensor; the last two axes are swapped before
                the model is called (presumably to put time before
                features -- TODO confirm against the featurizer).
            filename: Path of the audio file, read with ``torchaudio.load``.

        Returns:
            The words of the best hypothesis for the first batch item,
            joined by single spaces.

        Raises:
            RuntimeError: If any step fails; the original exception is
                attached as ``__cause__``.
        """
        try:
            if model is None:
                model = self.model
            # The sample rate returned by torchaudio.load is not used here.
            waveform, _ = torchaudio.load(filename)
            mel = featurizer(waveform).permute(0, 2, 1)  # Prepare mel features
            with torch.inference_mode():
                out = model(mel)
            results = self.decoder(out)
            # results[0][0]: first batch item, top-ranked hypothesis (nbest=1).
            return " ".join(results[0][0].words).strip()
        except Exception as e:
            # Chain explicitly so the root cause shows up in tracebacks.
            raise RuntimeError(f"Error during transcription: {e}") from e