Athspi committed
Commit 7fe102a · verified · 1 Parent(s): b25ba0c

Update app.py

Files changed (1): app.py (+191 -31)
app.py CHANGED
@@ -1,36 +1,196 @@
  import gradio as gr
- from faster_whisper import WhisperModel
-
- # Load the Faster Whisper model
- model = WhisperModel("large-v3", device="cpu")  # Use "cuda" for GPU acceleration
-
- # Define the transcription function
- def transcribe_audio(audio_file):
-     """
-     Transcribes the audio file using the Faster Whisper model.
-     """
-     try:
-         segments, info = model.transcribe(audio_file, beam_size=5)  # Adjust beam_size as needed
-         transcription = "\n".join(
-             [f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}" for segment in segments]
-         )
-         return transcription
-     except Exception as e:
-         return f"Error: {str(e)}"
-
- # Create the Gradio interface
- interface = gr.Interface(
-     fn=transcribe_audio,  # Function to process the input
-     inputs=gr.Audio(type="filepath", label="Upload Audio"),  # Corrected input component
-     outputs=gr.Textbox(label="Transcription"),  # Output: Textbox for the transcription
-     title="Audio-to-Text Transcription",
-     description=(
-         "Upload an audio file and get the transcription using the Faster Whisper model "
-         "large-v3. Supports high-quality transcription with beam search."
-     ),
-     allow_flagging="never",
- )
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     interface.launch(server_name="0.0.0.0", server_port=7860, share=True)
  import gradio as gr
+ import whisper
+ import os
+ from pydub import AudioSegment
+
+ # Mapping of model names to Whisper model sizes
+ MODELS = {
+     "Tiny (Fastest)": "tiny",
+     "Base (Faster)": "base",
+     "Small (Balanced)": "small",
+     "Medium (Accurate)": "medium",
+     "Large (Most Accurate)": "large"
+ }
+
+ # Mapping of full language names to language codes
+ LANGUAGE_NAME_TO_CODE = {
+     "Auto Detect": "Auto Detect",
+     "English": "en",
+     "Chinese": "zh",
+     "German": "de",
+     "Spanish": "es",
+     "Russian": "ru",
+     "Korean": "ko",
+     "French": "fr",
+     "Japanese": "ja",
+     "Portuguese": "pt",
+     "Turkish": "tr",
+     "Polish": "pl",
+     "Catalan": "ca",
+     "Dutch": "nl",
+     "Arabic": "ar",
+     "Swedish": "sv",
+     "Italian": "it",
+     "Indonesian": "id",
+     "Hindi": "hi",
+     "Finnish": "fi",
+     "Vietnamese": "vi",
+     "Hebrew": "he",
+     "Ukrainian": "uk",
+     "Greek": "el",
+     "Malay": "ms",
+     "Czech": "cs",
+     "Romanian": "ro",
+     "Danish": "da",
+     "Hungarian": "hu",
+     "Tamil": "ta",
+     "Norwegian": "no",
+     "Thai": "th",
+     "Urdu": "ur",
+     "Croatian": "hr",
+     "Bulgarian": "bg",
+     "Lithuanian": "lt",
+     "Latin": "la",
+     "Maori": "mi",
+     "Malayalam": "ml",
+     "Welsh": "cy",
+     "Slovak": "sk",
+     "Telugu": "te",
+     "Persian": "fa",
+     "Latvian": "lv",
+     "Bengali": "bn",
+     "Serbian": "sr",
+     "Azerbaijani": "az",
+     "Slovenian": "sl",
+     "Kannada": "kn",
+     "Estonian": "et",
+     "Macedonian": "mk",
+     "Breton": "br",
+     "Basque": "eu",
+     "Icelandic": "is",
+     "Armenian": "hy",
+     "Nepali": "ne",
+     "Mongolian": "mn",
+     "Bosnian": "bs",
+     "Kazakh": "kk",
+     "Albanian": "sq",
+     "Swahili": "sw",
+     "Galician": "gl",
+     "Marathi": "mr",
+     "Punjabi": "pa",
+     "Sinhala": "si",  # Sinhala support
+     "Khmer": "km",
+     "Shona": "sn",
+     "Yoruba": "yo",
+     "Somali": "so",
+     "Afrikaans": "af",
+     "Occitan": "oc",
+     "Georgian": "ka",
+     "Belarusian": "be",
+     "Tajik": "tg",
+     "Sindhi": "sd",
+     "Gujarati": "gu",
+     "Amharic": "am",
+     "Yiddish": "yi",
+     "Lao": "lo",
+     "Uzbek": "uz",
+     "Faroese": "fo",
+     "Haitian Creole": "ht",
+     "Pashto": "ps",
+     "Turkmen": "tk",
+     "Nynorsk": "nn",
+     "Maltese": "mt",
+     "Sanskrit": "sa",
+     "Luxembourgish": "lb",
+     "Burmese": "my",
+     "Tibetan": "bo",
+     "Tagalog": "tl",
+     "Malagasy": "mg",
+     "Assamese": "as",
+     "Tatar": "tt",
+     "Hawaiian": "haw",
+     "Lingala": "ln",
+     "Hausa": "ha",
+     "Bashkir": "ba",
+     "Javanese": "jw",
+     "Sundanese": "su",
+ }
+
+ def detect_language(audio_file):
+     """Detect the language of the audio file."""
+     # Load the Whisper model (use "base" for faster detection)
+     model = whisper.load_model("base")
+
+     # Convert audio to 16kHz mono for better compatibility with Whisper
+     audio = AudioSegment.from_file(audio_file)
+     audio = audio.set_frame_rate(16000).set_channels(1)
+     processed_audio_path = "processed_audio.wav"
+     audio.export(processed_audio_path, format="wav")
+
+     # Detect the language from the first 30 seconds with Whisper's
+     # built-in language detector
+     audio_data = whisper.load_audio(processed_audio_path)
+     audio_data = whisper.pad_or_trim(audio_data)
+     mel = whisper.log_mel_spectrogram(audio_data).to(model.device)
+     _, probs = model.detect_language(mel)
+     detected_language = max(probs, key=probs.get)
+
+     # Clean up the processed audio file
+     os.remove(processed_audio_path)
+
+     return f"Detected Language: {detected_language}"
+
+ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+     """Transcribe the audio file."""
+     # Load the selected Whisper model
+     model = whisper.load_model(MODELS[model_size])
+
+     # Convert audio to 16kHz mono for better compatibility with Whisper
+     audio = AudioSegment.from_file(audio_file)
+     audio = audio.set_frame_rate(16000).set_channels(1)
+     processed_audio_path = "processed_audio.wav"
+     audio.export(processed_audio_path, format="wav")
+
+     # Transcribe the audio
+     if language == "Auto Detect":
+         result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+         detected_language = result.get("language", "unknown")
+     else:
+         language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+         result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+         detected_language = language_code
+
+     # Clean up the processed audio file
+     os.remove(processed_audio_path)
+
+     # Return the transcription and the detected language
+     return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
+
+ # Define the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Audio Transcription and Language Detection")
+
+     with gr.Tab("Detect Language"):
+         gr.Markdown("Upload an audio file to detect its language.")
+         detect_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+         detect_language_output = gr.Textbox(label="Detected Language")
+         detect_button = gr.Button("Detect Language")
+
+     with gr.Tab("Transcribe Audio"):
+         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
+         transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+         language_dropdown = gr.Dropdown(
+             choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
+             label="Select Language",
+             value="Auto Detect"
+         )
+         model_dropdown = gr.Dropdown(
+             choices=list(MODELS.keys()),  # Model options
+             label="Select Model",
+             value="Base (Faster)"  # Default to the "Base" model
+         )
+         transcribe_output = gr.Textbox(label="Transcription and Detected Language")
+         transcribe_button = gr.Button("Transcribe Audio")
+
+     # Link buttons to functions
+     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
+     transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
+
+ # Launch the Gradio interface
+ demo.launch()
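
For a quick local check of this commit, a minimal sketch under assumptions not stated in the diff: app.py needs the gradio, openai-whisper, and pydub packages installed, and pydub relies on an ffmpeg binary on the PATH to decode non-WAV uploads. The snippet below exercises the same Whisper calls the app wraps, without launching the Gradio UI; "sample.wav" is a hypothetical file name.

# Standalone sketch of the pipeline behind transcribe_audio
# ("sample.wav" is assumed to exist; not part of the commit).
import whisper

model = whisper.load_model("tiny")  # smallest model, fast enough for a smoke test
result = model.transcribe("sample.wav", fp16=False)  # fp16=False matches the CPU-friendly setting above
print(result["language"])  # auto-detected language code, e.g. "en"
print(result["text"])      # the full transcription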