roman committed
Commit f9a3e58 · 1 Parent(s): 186a3b7

try new approach

Files changed (1): app.py +21 -48
app.py CHANGED
@@ -1,61 +1,34 @@
  import streamlit as st
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
- import tempfile
- from pydub import AudioSegment
- import numpy as np
- import torch


- # Load the processor and model
- @st.cache_resource
- def load_model_and_processor():
-     processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
-     model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")
-     return processor, model


- st.title("Voice Recognition App using Whisper")

- st.write("Upload an audio file and the Whisper model will transcribe it to text.")

- # Load the processor and model
- processor, model = load_model_and_processor()
- st.write("Model loaded successfully.")

- # File uploader for audio file
- uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])

  if uploaded_file is not None:
-     # Save the uploaded file temporarily
-     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-         temp_file.write(uploaded_file.read())
-         temp_file_path = temp_file.name

-     # Convert audio file to WAV format if necessary
-     audio = AudioSegment.from_file(temp_file_path)
-     temp_wav_path = tempfile.mktemp(suffix=".wav")
-     audio.export(temp_wav_path, format="wav")

-     st.audio(uploaded_file, format="audio/wav")
-
-     st.write("Transcribing audio...")
-
-     # Read the audio file
-     audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
-     audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)
-
-     # Normalize audio
-     audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
-     audio_input = torch.tensor(audio_input).unsqueeze(0)
-
-     # Process the audio
-     input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
-
-     # Generate transcription
-     with torch.no_grad():
-         predicted_ids = model.generate(input_features)
-
-     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-     st.write("Transcription:")
-     st.write(transcription)
  import streamlit as st
+ import librosa
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


+ uploaded_file = st.file_uploader("Upload file", type="wav")

+ processor = AutoProcessor.from_pretrained("Yehor/whisper-small-ukrainian")

+ model = AutoModelForSpeechSeq2Seq.from_pretrained("Yehor/whisper-small-ukrainian")

+ def map_to_pred(file_path):
+     # load the audio file, resampling to the 16 kHz rate the processor is told to expect
+     audio, _ = librosa.load(file_path, sr=16_000)

+     # preprocess the audio and generate the transcription
+     input_features = processor([audio], return_tensors="pt", sampling_rate=16_000).input_features
+     generated_ids = model.generate(inputs=input_features)
+     transcription = processor.batch_decode(generated_ids, normalize=True, skip_special_tokens=True)
+     text = processor.tokenizer._normalize(transcription[0])

+     return text
  if uploaded_file is not None:
+     # write the uploaded file object out to a local file path
+     file_path = './temp.wav'
+     with open(file_path, 'wb') as f:
+         f.write(uploaded_file.getbuffer())

+     text = map_to_pred(file_path)

+     # display results
+     st.write('Input audio:', uploaded_file.name)
+     st.write('Predicted text:', text)
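
One note on the new version: it drops the @st.cache_resource wrapper the old code used, so Streamlit reloads both checkpoints on every rerun of the script. A minimal sketch of restoring that caching around the new model (same model name as in the diff; the helper name is illustrative, not part of the commit):

import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

@st.cache_resource  # load once and reuse across Streamlit reruns
def load_model_and_processor():
    processor = AutoProcessor.from_pretrained("Yehor/whisper-small-ukrainian")
    model = AutoModelForSpeechSeq2Seq.from_pretrained("Yehor/whisper-small-ukrainian")
    return processor, model

processor, model = load_model_and_processor()

Separately, the hard-coded './temp.wav' path is shared by all sessions; the tempfile.NamedTemporaryFile pattern from the removed code would avoid collisions if two uploads arrive at once.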