Spaces:
Sleeping
Sleeping
File size: 4,945 Bytes
bc94d2b 5487b58 913fbb4 2910afc bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 913fbb4 bc94d2b 8b6c021 3fa4fb2 f36d376 3fa4fb2 f36d376 3fa4fb2 f36d376 3fa4fb2 f36d376 18922c4 8b6c021 bc94d2b 8b6c021 bc94d2b 8b6c021 bc94d2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import streamlit as st
import requests
import Levenshtein
import time
from io import BytesIO
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from audio_recorder_streamlit import audio_recorder
@st.cache_resource
def load_model():
    """Load the Arabic wav2vec2 processor and CTC model.

    Decorated with ``st.cache_resource`` so the returned pair is cached
    by Streamlit instead of being rebuilt on every call.

    Returns:
        A ``(processor, model)`` tuple, both loaded from the same
        pretrained checkpoint.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # The processor is built before the model, as in the original order.
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )
# Module-level processor/model pair; transcribe_audio_hf reads these globals.
processor, model = load_model()
def transcribe_audio_hf(audio_bytes):
    """Transcribe an in-memory audio clip with the module-level wav2vec2 model.

    Args:
        audio_bytes: Raw bytes of an audio file that ``librosa.load`` can
            decode.

    Returns:
        The decoded transcription of the clip, stripped of surrounding
        whitespace.
    """
    # librosa is asked for a 16 kHz signal; the same rate is handed to the
    # processor so both agree.
    waveform, rate = librosa.load(BytesIO(audio_bytes), sr=16000)
    encoded = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(encoded.input_values).logits

    # Pick the highest-scoring token at every frame, then decode.
    token_ids = logits.argmax(dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1: str, transcription2: str) -> float:
    """Return the Levenshtein similarity of two strings, normalized to [0, 1].

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0 and strings with
    nothing in common score 0.0.

    Args:
        transcription1: First string to compare.
        transcription2: Second string to compare.

    Returns:
        The similarity score. Two empty strings are treated as identical
        and score 1.0; callers that need to distinguish "nothing was
        transcribed" should check for empty input themselves.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both strings are empty: the normalization below would divide by
        # zero, and identical strings are fully similar by definition.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """Transcribe both clips and score how closely the transcriptions match.

    Args:
        original_audio_bytes: Raw bytes of the reference audio.
        user_audio_bytes: Raw bytes of the user's audio.

    Returns:
        A ``(original_transcription, user_transcription, similarity)``
        tuple, where ``similarity`` comes from ``levenshtein_similarity``.
    """
    # The reference clip is transcribed first, then the user's clip.
    reference_text, attempt_text = [
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    ]
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
st.title("Audio Transcription and Similarity Checker")


def _capture_audio(uploader_label, spinner_text, **recorder_style):
    """Render a recorder with an upload fallback and preview the clip.

    Args:
        uploader_label: Label of the file uploader shown while nothing has
            been recorded.
        spinner_text: Text of the spinner shown around the audio preview.
        **recorder_style: Extra keyword arguments forwarded to
            ``audio_recorder`` (colors, icon name).

    Returns:
        The clip bytes, or a falsy value when nothing has been recorded or
        uploaded yet.
    """
    clip_bytes = audio_recorder(
        text="Click to Record Audio",
        pause_threshold=30,
        icon_size="4x",
        **recorder_style,
    )
    # Recorder output keeps the "audio/wav" label this script has always
    # used for it; uploads may be MP3 and carry their own MIME type.
    clip_mime = "audio/wav"
    if not clip_bytes:
        uploaded = st.file_uploader(uploader_label, type=["wav", "mp3"])
        if uploaded:
            clip_bytes = uploaded.read()
            # Use the upload's reported MIME type so MP3 files are not
            # previewed as WAV; fall back to WAV if none is reported.
            clip_mime = uploaded.type or clip_mime
    if clip_bytes:
        with st.spinner(spinner_text):
            st.audio(clip_bytes, format=clip_mime)
    return clip_bytes


# Initialize the session state to control the view
if 'initialized' not in st.session_state:
    st.session_state['initialized'] = False

# A first recorder acts as the "initialize" control: once it returns any
# audio, the flag is set and the real recorders below are displayed.
if not st.session_state['initialized']:
    st.write("Click the Loader below to initialize the audio recorders.")
    init_button = audio_recorder(
        text="Click to Initialize",
        recording_color="#e8b62c",
        neutral_color="#6aa36f",
        pause_threshold=0.2,
        icon_name="play-circle",  # A play icon to signify starting the initialization
        icon_size="4x",
        auto_start=False
    )
    if init_button:
        st.session_state['initialized'] = True

# If initialized, display the recorders
if st.session_state['initialized']:
    st.subheader("Record or Upload Original Audio")
    original_audio_bytes = _capture_audio(
        "Or Upload Original Audio",
        "Processing original audio...",
        recording_color="#e8b62c",
        neutral_color="#6aa36f",
        icon_name="microphone",  # Any Font Awesome solid icon name works here
    )

    st.subheader("Record or Upload User Audio")
    st.write("")
    user_audio_bytes = _capture_audio(
        "Or Upload User Audio",
        "Processing user audio...",
        recording_color="#e86f6f",
        neutral_color="#6a6faf",
        icon_name="user",  # Any Font Awesome solid icon name works here
    )

    # The test button is only offered once both clips are available.
    if original_audio_bytes and user_audio_bytes:
        if st.button("Perform Testing"):
            with st.spinner("Performing transcription and similarity testing..."):
                transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio_bytes, user_audio_bytes)

            # Display results
            st.markdown("---")
            st.subheader("Transcriptions and Similarity Score")
            st.write(f"**Original Transcription:** {transcription_original}")
            st.write(f"**User Transcription:** {transcription_user}")
            st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

            if similarity_score > 0.8:  # Adjust the threshold as needed
                st.success("The pronunciation is likely correct based on transcription similarity.")
            else:
                st.error("The pronunciation may be incorrect based on transcription similarity.")
|