# NOTE: removed extraction residue (file-size header, git blame hashes,
# and a line-number gutter) that was not part of the original source.
import os
import tempfile

import librosa
import streamlit as st
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# Define available models (Hugging Face hub ids of Ukrainian Whisper checkpoints).
available_models = ["Yehor/whisper-small-ukrainian", "arampacha/whisper-large-uk-2"]
# , "Yehor/wav2vec2-xls-r-300m-uk-with-3gram-news-lm", "Yehor/wav2vec2-xls-r-300m-uk-with-wiki-lm"
# available_models = ["theodotus/stt_uk_squeezeformer_ctc_sm", "arampacha/whisper-large-uk-2"]
st.title("Voice Recognition App")
# Model selection dropdown
model_choice = st.selectbox("Choose a model", available_models)
# Load processor + model for the selected checkpoint.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# model is re-downloaded/re-loaded each time — consider @st.cache_resource.
processor = AutoProcessor.from_pretrained(model_choice)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_choice)
# Audio upload widget; only wav and mp3 are accepted.
uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])
def map_to_pred(file_path):
    """Transcribe the audio file at *file_path* with the selected model.

    Parameters
    ----------
    file_path : str
        Path to an audio file in any format librosa can decode.

    Returns
    -------
    str
        Normalized transcription text.
    """
    # Whisper expects 16 kHz input. librosa.load() defaults to 22 050 Hz,
    # which contradicted the sampling_rate=16000 declared to the processor
    # below — resample explicitly so the two agree.
    audio, _ = librosa.load(file_path, sr=16000)
    # Convert the waveform into log-mel input features for the model.
    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=16000
    ).input_features
    # Positional argument instead of the deprecated `inputs=` keyword.
    generated_ids = model.generate(input_features)
    # Decode token ids back to text, stripping special tokens.
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # Apply the tokenizer's text normalization to the single result.
    text = processor.tokenizer._normalize(transcription[0])
    return text
if uploaded_file is not None:
    # Persist the upload to disk once (the original wrote it to two
    # different files and then transcribed the raw, unconverted one).
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name
    # Re-encode to WAV so Whisper gets a supported format even for mp3
    # uploads. Derive the output path instead of tempfile.mktemp(), which
    # is deprecated and race-prone.
    audio = AudioSegment.from_file(temp_file_path)
    temp_wav_path = temp_file_path + ".wav"
    audio.export(temp_wav_path, format="wav")
    st.audio(uploaded_file, format="audio/wav")
    # Transcribe the *converted* WAV — the original passed the raw copy,
    # making the pydub conversion dead code.
    text = map_to_pred(temp_wav_path)
    # Remove both temporary files now that transcription is done.
    os.remove(temp_file_path)
    os.remove(temp_wav_path)
    # display results
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)