# asr-3 / app.py — Streamlit voice recognition demo
import torch
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pydub import AudioSegment
import streamlit as st
import tempfile
import librosa
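
# Note: pydub relies on an external ffmpeg (or libav) binary to decode
# non-WAV formats such as MP3; it must be installed on the host for the
# MP3 upload path below to work.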
# Define available models
available_models = ['Yehor/w2v-bert-2.0-uk']
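# Yehor/w2v-bert-2.0-uk is a Wav2Vec2-BERT model with a CTC head fine-tuned
# for Ukrainian speech recognition; it expects 16 kHz mono audio.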
st.title("Voice Recognition App")
# Model selection dropdown
model_name = st.selectbox("Choose a model", available_models)
# Load the model and processor (CPU inference)
asr_model = AutoModelForCTC.from_pretrained(model_name).to('cpu')
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
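
# Note: Streamlit reruns this whole script on every widget interaction, so the
# model above is re-instantiated each time. A minimal sketch of the usual fix,
# assuming a Streamlit version that provides st.cache_resource (>= 1.18):
#
#   @st.cache_resource
#   def load_asr(name):
#       model = AutoModelForCTC.from_pretrained(name).to('cpu')
#       proc = Wav2Vec2BertProcessor.from_pretrained(name)
#       return model, proc
#
#   asr_model, processor = load_asr(model_name)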
def map_to_pred(file_path, sampling_rate=16_000, device='cpu'):
    # Load the audio; librosa resamples to 22 050 Hz by default, so the
    # 16 kHz rate the model was trained on must be requested explicitly
    audio, _ = librosa.load(file_path, sr=sampling_rate)

    # Convert the waveform to model input features for a batch of one file
    features = processor([audio], sampling_rate=sampling_rate, return_tensors="pt").input_features
    features = features.to(device)

    # Greedy CTC decoding: argmax over the vocabulary at each frame, then
    # batch_decode collapses repeats and blanks into text
    with torch.no_grad():
        logits = asr_model(features).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = processor.batch_decode(predicted_ids)

    # Log results
    print('Predictions:', predictions)
    return predictions
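
# Example usage outside Streamlit (the path is illustrative):
#   predictions = map_to_pred('speech_16k.wav')
#   print(predictions[0])  # single transcription string for the one-file batch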
uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])
if uploaded_file is not None:
    # Save the uploaded file temporarily
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    # Convert the upload (e.g. MP3) to WAV so it can be loaded for inference
    audio = AudioSegment.from_file(temp_file_path)
    temp_wav_path = temp_file_path + ".wav"  # avoids the deprecated tempfile.mktemp
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")
    # Transcribe the converted WAV, not the raw upload
    text = map_to_pred(temp_wav_path)
    # Display results
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted text:', text)
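
# To run the app locally (assuming the dependencies above are installed):
#   streamlit run app.py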