import numpy as np
from pathlib import Path
import jiwer
import pdb
import torch.nn as nn
import torch
import torchaudio
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
import yaml
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf

def TOKENLIZER(audio_path):
    token_model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    # # load first sample of English common_voice
    # dataset = load_dataset("common_voice", "en", split="train", streaming=True)
    # dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
    # dataset_iter = iter(dataset)
    # sample = next(dataset_iter)
    # # forward sample through model to get greedily predicted transcription ids
    # input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values
    # pdb.set_trace()
    # load the audio file (waveform shape: [channels, num_frames])
    waveform, sr = torchaudio.load(audio_path)
    # downmix to mono and resample to the model's expected sampling rate
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != feature_extractor.sampling_rate:
        waveform = torchaudio.functional.resample(waveform, sr, feature_extractor.sampling_rate)
    # normalize the raw audio the same way the model saw during training
    input_values = feature_extractor(waveform.squeeze(0).numpy(), sampling_rate=feature_extractor.sampling_rate, return_tensors="pt").input_values
    # forward pass and greedy CTC decoding over the vocabulary dimension
    with torch.no_grad():
        logits = token_model(input_values).logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    # retrieve word offsets (pass `output_char_offsets=True` for character-level timestamps instead)
    outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
    # convert logit-frame indices to seconds: downsampling ratio divided by the sampling rate
    time_offset = token_model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate
    word_offsets = [
        {
            "word": d["word"],
            "start_time": round(d["start_offset"] * time_offset, 2),
            "end_time": round(d["end_offset"] * time_offset, 2),
        }
        for d in outputs.word_offsets
    ]
    return word_offsets
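
# A minimal usage sketch: "sample.wav" is a hypothetical placeholder path, not a file
# shipped with this Space. It simply prints the word-level timestamps returned above.
if __name__ == "__main__":
    for entry in TOKENLIZER("sample.wav"):
        print(f'{entry["word"]}: {entry["start_time"]}s - {entry["end_time"]}s')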