Athspi committed · Commit 0a51f5f · verified · 1 Parent(s): 948133d

Update app.py

Files changed (1):
  1. app.py +41 -69
app.py CHANGED
@@ -3,7 +3,6 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment
-from transformers import pipeline
 from faster_whisper import WhisperModel # Import faster-whisper
 
 # Mapping of model names to Whisper model sizes
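Taken together, the imports above pin down the app's runtime dependencies. A requirements sketch inferred from those imports (the diff pins no versions, so none are pinned here; note that pydub also needs an ffmpeg binary on the PATH to decode non-WAV uploads):

    # requirements.txt sketch, inferred from the import statements
    openai-whisper
    faster-whisper
    pydub
    torch
    gradio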
@@ -13,20 +12,7 @@ MODELS = {
     "Small (Balanced)": "small",
     "Medium (Accurate)": "medium",
     "Large (Most Accurate)": "large",
-    "Systran Faster Whisper Large v3": "Systran/faster-whisper-large-v3" # Add the new model
-}
-
-# Fine-tuned models for specific languages
-FINE_TUNED_MODELS = {
-    "Tamil": {
-        "model": "vasista22/whisper-tamil-medium",
-        "language": "ta"
-    },
-    "Sinhala": {
-        "model": "Subhaka/whisper-small-Sinhala-Fine_Tune", # Add the new fine-tuned model
-        "language": "si" # Sinhala language code
-    },
-    # Add more fine-tuned models for other languages here
+    "Faster Whisper Large v3": "Systran/faster-whisper-large-v3" # Renamed and set as default
 }
 
 # Mapping of full language names to language codes
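A detail worth keeping in mind: the MODELS dict now feeds two different loaders. faster-whisper's WhisperModel accepts a Hugging Face repo ID pointing at converted CTranslate2 weights (as with "Systran/faster-whisper-large-v3"), while openai-whisper's load_model expects a bare size name. A minimal sketch of both resolutions (CPU settings assumed for illustration):

    import whisper
    from faster_whisper import WhisperModel

    fw = WhisperModel(MODELS["Faster Whisper Large v3"], device="cpu", compute_type="int8")  # HF repo ID
    ow = whisper.load_model(MODELS["Small (Balanced)"])  # plain size name, here "small"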
@@ -138,18 +124,22 @@ CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    # Load the Whisper model (use "base" for faster detection)
-    model = whisper.load_model("base")
+    # Define device and compute type for faster-whisper
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float32" if device == "cuda" else "int8"
 
-    # Convert audio to 16kHz mono for better compatibility with Whisper
+    # Load the faster-whisper model for language detection
+    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+    # Convert audio to 16kHz mono for better compatibility
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
     processed_audio_path = "processed_audio.wav"
     audio.export(processed_audio_path, format="wav")
 
-    # Detect the language
-    result = model.transcribe(processed_audio_path, task="detect_language", fp16=False)
-    detected_language_code = result.get("language", "unknown")
+    # Detect the language using faster-whisper
+    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
+    detected_language_code = info.language
 
     # Get the full language name from the code
     detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
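Since faster-whisper yields its segments lazily, the new detect_language never runs a full decode: language identification happens up front and is read from info, so the task argument (here "translate") never actually comes into play. A minimal sketch of the same detection with the default task, also reading the confidence that faster-whisper's TranscriptionInfo exposes (the print line is illustrative only):

    segments, info = model.transcribe(processed_audio_path, language=None)
    # `segments` is a generator; leaving it unconsumed skips the decoding pass.
    print(info.language, f"{info.language_probability:.2f}")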
@@ -159,7 +149,7 @@ def detect_language(audio_file):
 
     return f"Detected Language: {detected_language}"
 
-def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
     # Convert audio to 16kHz mono for better compatibility
     audio = AudioSegment.from_file(audio_file)
@@ -168,56 +158,38 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     audio.export(processed_audio_path, format="wav")
 
     # Load the appropriate model
-    if language in FINE_TUNED_MODELS:
-        # Use the fine-tuned Whisper model for the selected language
-        device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        transcribe = pipeline(
-            task="automatic-speech-recognition",
-            model=FINE_TUNED_MODELS[language]["model"],
-            chunk_length_s=30,
-            device=device
-        )
-        transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(
-            language=FINE_TUNED_MODELS[language]["language"],
-            task="transcribe"
-        )
-        result = transcribe(processed_audio_path)
-        transcription = result["text"]
-        detected_language = language
-    else:
-        # Use the selected Whisper model
-        if model_size == "Systran Faster Whisper Large v3":
-            # Define device and compute type for faster-whisper
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            compute_type = "float32" if device == "cuda" else "int8"
-
-            # Use faster-whisper for the Systran model
-            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-            segments, info = model.transcribe(
-                processed_audio_path,
-                task="transcribe",
-                word_timestamps=True,
-                repetition_penalty=1.1,
-                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-            )
-            transcription = " ".join([segment.text for segment in segments])
-            detected_language_code = info.language
-            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-        else:
-            # Use the standard Whisper model
-            model = whisper.load_model(MODELS[model_size])
-
-            # Transcribe the audio
-            if language == "Auto Detect":
-                result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
-                detected_language_code = result.get("language", "unknown")
-                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-            else:
-                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
-                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-                detected_language = language
-
-        transcription = result["text"]
+    if model_size == "Faster Whisper Large v3":
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Use faster-whisper for the Systran model
+        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+        segments, info = model.transcribe(
+            processed_audio_path,
+            task="transcribe",
+            word_timestamps=True,
+            repetition_penalty=1.1,
+            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+        )
+        transcription = " ".join([segment.text for segment in segments])
+        detected_language_code = info.language
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+    else:
+        # Use the standard Whisper model
+        model = whisper.load_model(MODELS[model_size])
+
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
+            detected_language_code = result.get("language", "unknown")
+            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+        else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            detected_language = language
+
+        transcription = result["text"]
 
     # Clean up processed audio file
     os.remove(processed_audio_path)
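The temperature list in the faster-whisper branch mirrors Whisper's usual fallback schedule: decoding is retried at the next temperature whenever the compression-ratio or log-probability checks reject an output. Note also that segments is a generator, so the " ".join(...) is what actually drives decoding. A sketch that consumes the same generator while surfacing the word timings requested via word_timestamps=True (attribute names as defined by faster-whisper's Segment and Word types; the print is illustrative only):

    parts = []
    for segment in segments:  # iterating performs the actual decoding
        parts.append(segment.text)
        for word in segment.words or []:  # populated because word_timestamps=True
            print(f"{word.start:.2f}-{word.end:.2f}{word.word}")
    transcription = " ".join(parts)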
@@ -246,7 +218,7 @@ with gr.Blocks() as demo:
     model_dropdown = gr.Dropdown(
         choices=list(MODELS.keys()), # Model options
         label="Select Model",
-        value="Base (Faster)", # Default to "Base" model
+        value="Faster Whisper Large v3", # Default to "Faster Whisper Large v3"
         interactive=True # Allow model selection by default
     )
     transcribe_output = gr.Textbox(label="Transcription and Detected Language")
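For context, a sketch of how this dropdown would typically be wired to transcribe_audio inside the same gr.Blocks; the audio_input, language_dropdown, and transcribe_button names are hypothetical, since those components fall outside this diff:

    audio_input = gr.Audio(type="filepath", label="Upload Audio")  # hypothetical component
    language_dropdown = gr.Dropdown(  # hypothetical component
        choices=["Auto Detect"] + list(LANGUAGE_NAME_TO_CODE.keys()),
        value="Auto Detect",
        label="Select Language",
    )
    transcribe_button = gr.Button("Transcribe")  # hypothetical component
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown, model_dropdown],
        outputs=transcribe_output,
    )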
 
3
  import torch
4
  import os
5
  from pydub import AudioSegment
 
6
  from faster_whisper import WhisperModel # Import faster-whisper
7
 
8
  # Mapping of model names to Whisper model sizes
 
12
  "Small (Balanced)": "small",
13
  "Medium (Accurate)": "medium",
14
  "Large (Most Accurate)": "large",
15
+ "Faster Whisper Large v3": "Systran/faster-whisper-large-v3" # Renamed and set as default
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
 
18
  # Mapping of full language names to language codes
 
124
 
125
  def detect_language(audio_file):
126
  """Detect the language of the audio file."""
127
+ # Define device and compute type for faster-whisper
128
+ device = "cuda" if torch.cuda.is_available() else "cpu"
129
+ compute_type = "float32" if device == "cuda" else "int8"
130
 
131
+ # Load the faster-whisper model for language detection
132
+ model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
133
+
134
+ # Convert audio to 16kHz mono for better compatibility
135
  audio = AudioSegment.from_file(audio_file)
136
  audio = audio.set_frame_rate(16000).set_channels(1)
137
  processed_audio_path = "processed_audio.wav"
138
  audio.export(processed_audio_path, format="wav")
139
 
140
+ # Detect the language using faster-whisper
141
+ segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
142
+ detected_language_code = info.language
143
 
144
  # Get the full language name from the code
145
  detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
 
149
 
150
  return f"Detected Language: {detected_language}"
151
 
152
+ def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
153
  """Transcribe the audio file."""
154
  # Convert audio to 16kHz mono for better compatibility
155
  audio = AudioSegment.from_file(audio_file)
 
158
  audio.export(processed_audio_path, format="wav")
159
 
160
  # Load the appropriate model
161
+ if model_size == "Faster Whisper Large v3":
162
+ # Define device and compute type for faster-whisper
163
+ device = "cuda" if torch.cuda.is_available() else "cpu"
164
+ compute_type = "float32" if device == "cuda" else "int8"
165
+
166
+ # Use faster-whisper for the Systran model
167
+ model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
168
+ segments, info = model.transcribe(
169
+ processed_audio_path,
170
+ task="transcribe",
171
+ word_timestamps=True,
172
+ repetition_penalty=1.1,
173
+ temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
174
  )
175
+ transcription = " ".join([segment.text for segment in segments])
176
+ detected_language_code = info.language
177
+ detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
 
 
 
 
178
  else:
179
+ # Use the standard Whisper model
180
+ model = whisper.load_model(MODELS[model_size])
181
+
182
+ # Transcribe the audio
183
+ if language == "Auto Detect":
184
+ result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
185
+ detected_language_code = result.get("language", "unknown")
 
 
 
 
 
 
 
 
 
 
186
  detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
187
  else:
188
+ language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
189
+ result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
190
+ detected_language = language
191
+
192
+ transcription = result["text"]
 
 
 
 
 
 
 
 
 
193
 
194
  # Clean up processed audio file
195
  os.remove(processed_audio_path)
 
218
  model_dropdown = gr.Dropdown(
219
  choices=list(MODELS.keys()), # Model options
220
  label="Select Model",
221
+ value="Faster Whisper Large v3", # Default to "Faster Whisper Large v3"
222
  interactive=True # Allow model selection by default
223
  )
224
  transcribe_output = gr.Textbox(label="Transcription and Detected Language")