"""Streamlit demo: record speech in the browser and transcribe it with Wav2Vec2.

Flow of the script:
1. Show the ``audio_recorder`` widget and, once a recording exists, play it
   back with ``st.audio``.
2. Decode the recorded WAV container into a mono float32 waveform.
3. Run the ``facebook/wav2vec2-base-960h`` CTC model on the waveform and
   greedy-decode the logits (arg-max per frame) into text.
4. Show the transcription in the page and also print it to the console.
"""

import io
import wave

import numpy as np
import streamlit as st
import torch
from audio_recorder_streamlit import audio_recorder
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

MODEL_NAME = "facebook/wav2vec2-base-960h"
# Rate requested from the recorder widget and passed to nothing else; the
# checkpoint name suggests 16 kHz training audio -- TODO confirm against the
# model card if another checkpoint is substituted.
SAMPLE_RATE = 16_000

# PCM sample width in bytes -> NumPy dtype used to reinterpret the raw frames.
_PCM_DTYPES = {1: np.uint8, 2: np.int16, 4: np.int32}


@st.cache_resource
def load_model():
    """Load the pre-trained tokenizer and CTC model.

    Streamlit re-executes the whole script on every interaction; caching the
    loaded objects avoids calling ``from_pretrained`` again on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
    return tokenizer, model


def wav_bytes_to_waveform(wav_bytes):
    """Decode an in-memory PCM WAV file into a mono float32 waveform.

    Args:
        wav_bytes: Complete WAV file contents (header plus frames) as bytes.

    Returns:
        A ``(samples, rate)`` tuple: ``samples`` is a 1-D float32 array scaled
        to roughly [-1.0, 1.0) and ``rate`` is the sample rate read from the
        WAV header.

    Raises:
        wave.Error: If the bytes are not a WAV file the ``wave`` module can
            parse.
        ValueError: If the PCM sample width is not 1, 2 or 4 bytes.
    """
    with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
        channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        rate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())

    dtype = _PCM_DTYPES.get(sample_width)
    if dtype is None:
        raise ValueError(f"unsupported PCM sample width: {sample_width} bytes")

    samples = np.frombuffer(frames, dtype=dtype).astype(np.float32)
    if sample_width == 1:
        # 8-bit WAV samples are unsigned and centred on 128.
        samples = (samples - 128.0) / 128.0
    else:
        samples = samples / float(2 ** (8 * sample_width - 1))

    if channels > 1:
        # Drop a trailing partial frame, then average the interleaved
        # channels down to mono.
        usable = samples.size - samples.size % channels
        samples = samples[:usable].reshape(-1, channels).mean(axis=1)

    return samples, rate


audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=SAMPLE_RATE)

# Nothing has been recorded on the first render, so everything that needs the
# audio stays inside this guard.
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")

    try:
        # Assumes the recorder returns a PCM WAV container (it is played back
        # above as "audio/wav") -- decoding errors are reported, not raised.
        speech, rate = wav_bytes_to_waveform(audio_bytes)
    except (wave.Error, EOFError, ValueError) as exc:
        st.error(f"Could not decode the recording: {exc}")
    else:
        if speech.size == 0:
            st.warning("The recording contains no audio samples.")
        else:
            if rate != SAMPLE_RATE:
                st.warning(
                    f"Recording sample rate is {rate} Hz but {SAMPLE_RATE} Hz "
                    "was requested; the transcription may be inaccurate."
                )

            # Load the pre-trained model and tokenizer (cached across reruns).
            tokenizer, model = load_model()

            # The tokenizer takes the decoded waveform, not the WAV bytes.
            input_values = tokenizer(speech, return_tensors="pt").input_values

            # Inference only: no gradients are needed.
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the predicted token ids to generate text.
            transcriptions = tokenizer.decode(predicted_ids[0])
            st.write(transcriptions)
            # Kept from the original script: also log to the console.
            print(transcriptions)