# NOTE: removed extraction residue (file-size header, git blame hashes,
# and a line-number gutter) that was not part of the original source.
import os
import tempfile

import librosa
import streamlit as st
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# Define available models (Hugging Face hub ids of Ukrainian Whisper checkpoints).
available_models = ["Yehor/whisper-small-ukrainian", "arampacha/whisper-large-uk-2"]
# , "Yehor/wav2vec2-xls-r-300m-uk-with-3gram-news-lm", "Yehor/wav2vec2-xls-r-300m-uk-with-wiki-lm"
# available_models = ["theodotus/stt_uk_squeezeformer_ctc_sm", "arampacha/whisper-large-uk-2"]
st.title("Voice Recognition App")
# Model selection dropdown
model_choice = st.selectbox("Choose a model", available_models)
# Load processor + model for the selected checkpoint.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# model is re-downloaded/re-loaded each time — consider @st.cache_resource.
processor = AutoProcessor.from_pretrained(model_choice)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_choice)
# Audio upload widget; only wav and mp3 are accepted.
uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])
def map_to_pred(file_path):
    """Transcribe the audio file at *file_path* with the selected model.

    Parameters
    ----------
    file_path : str
        Path to an audio file in any format librosa can decode.

    Returns
    -------
    str
        Normalized transcription text.
    """
    # Whisper expects 16 kHz input. librosa.load() defaults to 22 050 Hz,
    # which contradicted the sampling_rate=16000 declared to the processor
    # below — resample explicitly so the two agree.
    audio, _ = librosa.load(file_path, sr=16000)
    # Convert the waveform into log-mel input features for the model.
    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=16000
    ).input_features
    # Positional argument instead of the deprecated `inputs=` keyword.
    generated_ids = model.generate(input_features)
    # Decode token ids back to text, stripping special tokens.
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # Apply the tokenizer's text normalization to the single result.
    text = processor.tokenizer._normalize(transcription[0])
    return text
if uploaded_file is not None:
    # Persist the upload to disk once (the original wrote it to two
    # different files and then transcribed the raw, unconverted one).
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name
    # Re-encode to WAV so Whisper gets a supported format even for mp3
    # uploads. Derive the output path instead of tempfile.mktemp(), which
    # is deprecated and race-prone.
    audio = AudioSegment.from_file(temp_file_path)
    temp_wav_path = temp_file_path + ".wav"
    audio.export(temp_wav_path, format="wav")
    st.audio(uploaded_file, format="audio/wav")
    # Transcribe the *converted* WAV — the original passed the raw copy,
    # making the pydub conversion dead code.
    text = map_to_pred(temp_wav_path)
    # Remove both temporary files now that transcription is done.
    os.remove(temp_file_path)
    os.remove(temp_wav_path)
    # display results
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)