Prathamesh Sarjerao Vaidya
committed on
Commit
·
fdcc0cf
1
Parent(s):
785835e
made changes to fix Docker deployment issue
Browse files
- Dockerfile +5 -1
- model_preloader.py +15 -14
- requirements.txt +5 -2
- src/speech_recognizer.py +78 -1
- startup.py +15 -2
- web_app.py +5 -2
Dockerfile
CHANGED
@@ -72,7 +72,11 @@ ENV PYTHONPATH=/app \
|
|
72 |
# Fix executable stack issues
|
73 |
ONNX_EXECUTION_PROVIDER=cpu \
|
74 |
# Disable problematic optimizations
|
75 |
-
OMP_NUM_THREADS=1
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# Expose port for Hugging Face Spaces
|
78 |
EXPOSE 7860
|
|
|
72 |
# Fix executable stack issues
|
73 |
ONNX_EXECUTION_PROVIDER=cpu \
|
74 |
# Disable problematic optimizations
|
75 |
+
OMP_NUM_THREADS=1 \
|
76 |
+
# Suppress tensorboard warnings
|
77 |
+
TF_ENABLE_ONEDNN_OPTS=0 \
|
78 |
+
# Disable problematic features
|
79 |
+
DISABLE_ONNX_EXECUTION_PROVIDERS=CPUExecutionProvider
|
80 |
|
81 |
# Expose port for Hugging Face Spaces
|
82 |
EXPOSE 7860
|
model_preloader.py
CHANGED
@@ -266,7 +266,7 @@ class ModelPreloader:
|
|
266 |
torch_dtype=torch.float32 if self.device == "cpu" else torch.float16
|
267 |
)
|
268 |
|
269 |
-
console.print(f"[green]
|
270 |
|
271 |
except Exception as e:
|
272 |
console.print(f"[yellow]Local cache load failed for {model_name}, will download: {e}[/yellow]")
|
@@ -286,7 +286,7 @@ class ModelPreloader:
|
|
286 |
torch_dtype=torch.float32 if self.device == "cpu" else torch.float16
|
287 |
)
|
288 |
|
289 |
-
console.print(f"[green]
|
290 |
|
291 |
# Move to device if needed
|
292 |
if self.device != "cpu":
|
@@ -377,12 +377,12 @@ class ModelPreloader:
|
|
377 |
)
|
378 |
|
379 |
# Test the pipeline
|
380 |
-
console.print(f"[green]
|
381 |
|
382 |
return pipeline
|
383 |
|
384 |
except Exception as e:
|
385 |
-
console.print(f"[red]
|
386 |
logger.error(f"Pyannote loading failed: {e}")
|
387 |
return None
|
388 |
|
@@ -409,12 +409,12 @@ class ModelPreloader:
|
|
409 |
dummy_audio = np.zeros(16000, dtype=np.float32) # 1 second of silence
|
410 |
result = model.transcribe(dummy_audio, language="en")
|
411 |
|
412 |
-
console.print(f"[green]
|
413 |
|
414 |
return model
|
415 |
|
416 |
except Exception as e:
|
417 |
-
console.print(f"[red]
|
418 |
logger.error(f"Whisper loading failed: {e}")
|
419 |
return None
|
420 |
|
@@ -444,7 +444,7 @@ class ModelPreloader:
|
|
444 |
sys_info = self.get_system_info()
|
445 |
|
446 |
info_panel = Panel.fit(
|
447 |
-
f"""
|
448 |
|
449 |
• CPU Cores: {sys_info['cpu_count']}
|
450 |
• Total Memory: {sys_info['memory_gb']} GB
|
@@ -487,7 +487,7 @@ class ModelPreloader:
|
|
487 |
|
488 |
# Check cache first
|
489 |
if self.check_model_cache(model_key):
|
490 |
-
console.print(f"[green]
|
491 |
progress.update(task_id, completed=100)
|
492 |
progress.update(main_task, advance=1)
|
493 |
results["models"][model_key] = {"status": "cached", "time": 0}
|
@@ -541,13 +541,13 @@ class ModelPreloader:
|
|
541 |
# Summary
|
542 |
console.print()
|
543 |
if results["success_count"] == results["total_count"]:
|
544 |
-
status_text = "[bold green]
|
545 |
status_color = "green"
|
546 |
elif results["success_count"] > 0:
|
547 |
-
status_text = f"[bold yellow]
|
548 |
status_color = "yellow"
|
549 |
else:
|
550 |
-
status_text = "[bold red]
|
551 |
status_color = "red"
|
552 |
|
553 |
summary_panel = Panel.fit(
|
@@ -577,8 +577,9 @@ class ModelPreloader:
|
|
577 |
|
578 |
def main():
|
579 |
"""Main function to run model preloading."""
|
|
|
580 |
console.print(Panel.fit(
|
581 |
-
"[bold blue]
|
582 |
border_style="blue"
|
583 |
))
|
584 |
console.print()
|
@@ -591,11 +592,11 @@ def main():
|
|
591 |
results = preloader.preload_all_models()
|
592 |
|
593 |
if results["success_count"] > 0:
|
594 |
-
console.print("\n[bold green]
|
595 |
console.print(f"[dim]Models cached in: {preloader.cache_dir}[/dim]")
|
596 |
return True
|
597 |
else:
|
598 |
-
console.print("\n[bold red]
|
599 |
return False
|
600 |
|
601 |
except KeyboardInterrupt:
|
|
|
266 |
torch_dtype=torch.float32 if self.device == "cpu" else torch.float16
|
267 |
)
|
268 |
|
269 |
+
console.print(f"[green]SUCCESS: Successfully loaded {model_name} from local cache[/green]")
|
270 |
|
271 |
except Exception as e:
|
272 |
console.print(f"[yellow]Local cache load failed for {model_name}, will download: {e}[/yellow]")
|
|
|
286 |
torch_dtype=torch.float32 if self.device == "cpu" else torch.float16
|
287 |
)
|
288 |
|
289 |
+
console.print(f"[green]SUCCESS: Successfully downloaded and loaded {model_name}[/green]")
|
290 |
|
291 |
# Move to device if needed
|
292 |
if self.device != "cpu":
|
|
|
377 |
)
|
378 |
|
379 |
# Test the pipeline
|
380 |
+
console.print(f"[green]SUCCESS: pyannote.audio pipeline loaded successfully on {self.device}[/green]")
|
381 |
|
382 |
return pipeline
|
383 |
|
384 |
except Exception as e:
|
385 |
+
console.print(f"[red]ERROR: Failed to load pyannote.audio pipeline: {e}[/red]")
|
386 |
logger.error(f"Pyannote loading failed: {e}")
|
387 |
return None
|
388 |
|
|
|
409 |
dummy_audio = np.zeros(16000, dtype=np.float32) # 1 second of silence
|
410 |
result = model.transcribe(dummy_audio, language="en")
|
411 |
|
412 |
+
console.print(f"[green]SUCCESS: Whisper model loaded successfully on {self.device}[/green]")
|
413 |
|
414 |
return model
|
415 |
|
416 |
except Exception as e:
|
417 |
+
console.print(f"[red]ERROR: Failed to load Whisper model: {e}[/red]")
|
418 |
logger.error(f"Whisper loading failed: {e}")
|
419 |
return None
|
420 |
|
|
|
444 |
sys_info = self.get_system_info()
|
445 |
|
446 |
info_panel = Panel.fit(
|
447 |
+
f"""System Information
|
448 |
|
449 |
• CPU Cores: {sys_info['cpu_count']}
|
450 |
• Total Memory: {sys_info['memory_gb']} GB
|
|
|
487 |
|
488 |
# Check cache first
|
489 |
if self.check_model_cache(model_key):
|
490 |
+
console.print(f"[green]SUCCESS: {config['description']} found in cache[/green]")
|
491 |
progress.update(task_id, completed=100)
|
492 |
progress.update(main_task, advance=1)
|
493 |
results["models"][model_key] = {"status": "cached", "time": 0}
|
|
|
541 |
# Summary
|
542 |
console.print()
|
543 |
if results["success_count"] == results["total_count"]:
|
544 |
+
status_text = "[bold green]SUCCESS: All models loaded successfully![/bold green]"
|
545 |
status_color = "green"
|
546 |
elif results["success_count"] > 0:
|
547 |
+
status_text = f"[bold yellow]WARNING: {results['success_count']}/{results['total_count']} models loaded[/bold yellow]"
|
548 |
status_color = "yellow"
|
549 |
else:
|
550 |
+
status_text = "[bold red]ERROR: No models loaded successfully[/bold red]"
|
551 |
status_color = "red"
|
552 |
|
553 |
summary_panel = Panel.fit(
|
|
|
577 |
|
578 |
def main():
|
579 |
"""Main function to run model preloading."""
|
580 |
+
# Use ASCII-safe characters for Windows compatibility
|
581 |
console.print(Panel.fit(
|
582 |
+
"[bold blue]Multilingual Audio Intelligence System[/bold blue]\n[yellow]Model Preloader[/yellow]",
|
583 |
border_style="blue"
|
584 |
))
|
585 |
console.print()
|
|
|
592 |
results = preloader.preload_all_models()
|
593 |
|
594 |
if results["success_count"] > 0:
|
595 |
+
console.print("\n[bold green]SUCCESS: Model preloading completed![/bold green]")
|
596 |
console.print(f"[dim]Models cached in: {preloader.cache_dir}[/dim]")
|
597 |
return True
|
598 |
else:
|
599 |
+
console.print("\n[bold red]ERROR: Model preloading failed![/bold red]")
|
600 |
return False
|
601 |
|
602 |
except KeyboardInterrupt:
|
requirements.txt
CHANGED
@@ -25,11 +25,11 @@ pyannote.metrics==3.2.1
|
|
25 |
# Performance & Optimization
|
26 |
numba==0.58.1
|
27 |
# Use CPU-only onnxruntime to avoid executable stack issues
|
28 |
-
onnxruntime
|
29 |
accelerate==0.20.3
|
30 |
|
31 |
# Core Utilities
|
32 |
-
numpy
|
33 |
psutil==5.9.6
|
34 |
python-dotenv==1.0.0
|
35 |
requests==2.31.0
|
@@ -56,6 +56,7 @@ deep-translator==1.11.4
|
|
56 |
# Scientific Computing
|
57 |
scipy==1.11.4
|
58 |
matplotlib==3.7.3
|
|
|
59 |
scikit-learn==1.3.2
|
60 |
|
61 |
# PS-6 Specific Dependencies (HF Spaces compatible)
|
@@ -73,6 +74,8 @@ rich==13.7.0
|
|
73 |
|
74 |
# Machine Learning
|
75 |
tensorflow==2.15.0
|
|
|
|
|
76 |
|
77 |
# Additional Dependencies
|
78 |
huggingface-hub==0.16.4
|
|
|
25 |
# Performance & Optimization
|
26 |
numba==0.58.1
|
27 |
# Use CPU-only onnxruntime to avoid executable stack issues
|
28 |
+
onnxruntime==1.16.3
|
29 |
accelerate==0.20.3
|
30 |
|
31 |
# Core Utilities
|
32 |
+
numpy
|
33 |
psutil==5.9.6
|
34 |
python-dotenv==1.0.0
|
35 |
requests==2.31.0
|
|
|
56 |
# Scientific Computing
|
57 |
scipy==1.11.4
|
58 |
matplotlib==3.7.3
|
59 |
+
plotly==5.17.0
|
60 |
scikit-learn==1.3.2
|
61 |
|
62 |
# PS-6 Specific Dependencies (HF Spaces compatible)
|
|
|
74 |
|
75 |
# Machine Learning
|
76 |
tensorflow==2.15.0
|
77 |
+
# Fix tensorboard compatibility
|
78 |
+
tensorboard==2.15.2
|
79 |
|
80 |
# Additional Dependencies
|
81 |
huggingface-hub==0.16.4
|
src/speech_recognizer.py
CHANGED
@@ -55,6 +55,9 @@ class TranscriptionSegment:
|
|
55 |
language_probability: float
|
56 |
no_speech_probability: float
|
57 |
words: Optional[List[Dict]] = None
|
|
|
|
|
|
|
58 |
|
59 |
|
60 |
class SpeechRecognizer:
|
@@ -160,7 +163,10 @@ class SpeechRecognizer:
|
|
160 |
language=result.get("language", "unknown"),
|
161 |
language_probability=result.get("language_probability", 1.0),
|
162 |
no_speech_probability=segment.get("no_speech_prob", 0.0),
|
163 |
-
words=words
|
|
|
|
|
|
|
164 |
))
|
165 |
|
166 |
return segments
|
@@ -193,6 +199,77 @@ class SpeechRecognizer:
|
|
193 |
logger.error(f"File transcription failed: {e}")
|
194 |
raise
|
195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
def get_supported_languages(self) -> List[str]:
|
197 |
"""Get list of supported language codes."""
|
198 |
return [
|
|
|
55 |
language_probability: float
|
56 |
no_speech_probability: float
|
57 |
words: Optional[List[Dict]] = None
|
58 |
+
speaker_id: Optional[str] = None
|
59 |
+
confidence: Optional[float] = None
|
60 |
+
word_timestamps: Optional[List[Dict]] = None
|
61 |
|
62 |
|
63 |
class SpeechRecognizer:
|
|
|
163 |
language=result.get("language", "unknown"),
|
164 |
language_probability=result.get("language_probability", 1.0),
|
165 |
no_speech_probability=segment.get("no_speech_prob", 0.0),
|
166 |
+
words=words,
|
167 |
+
speaker_id=None,
|
168 |
+
confidence=1.0 - segment.get("no_speech_prob", 0.0),
|
169 |
+
word_timestamps=words
|
170 |
))
|
171 |
|
172 |
return segments
|
|
|
199 |
logger.error(f"File transcription failed: {e}")
|
200 |
raise
|
201 |
|
202 |
+
def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int,
|
203 |
+
speaker_segments: List[Tuple[float, float, str]],
|
204 |
+
word_timestamps: bool = True) -> List[TranscriptionSegment]:
|
205 |
+
"""
|
206 |
+
Transcribe audio segments with speaker information.
|
207 |
+
|
208 |
+
Args:
|
209 |
+
audio_data: Audio data as numpy array
|
210 |
+
sample_rate: Sample rate of the audio
|
211 |
+
speaker_segments: List of (start_time, end_time, speaker_id) tuples
|
212 |
+
word_timestamps: Whether to include word-level timestamps
|
213 |
+
|
214 |
+
Returns:
|
215 |
+
List of TranscriptionSegment objects with speaker information
|
216 |
+
"""
|
217 |
+
if self.model is None:
|
218 |
+
raise RuntimeError("Model not initialized")
|
219 |
+
|
220 |
+
try:
|
221 |
+
# Prepare audio for Whisper (expects 16kHz)
|
222 |
+
if sample_rate != 16000:
|
223 |
+
import librosa
|
224 |
+
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
225 |
+
|
226 |
+
# Transcribe the entire audio first
|
227 |
+
result = self.model.transcribe(
|
228 |
+
audio_data,
|
229 |
+
language=self.language,
|
230 |
+
word_timestamps=word_timestamps,
|
231 |
+
verbose=False
|
232 |
+
)
|
233 |
+
|
234 |
+
# Convert to our format and add speaker information
|
235 |
+
segments = []
|
236 |
+
for segment in result["segments"]:
|
237 |
+
# Find the speaker for this segment
|
238 |
+
speaker_id = "Unknown"
|
239 |
+
for start_time, end_time, spk_id in speaker_segments:
|
240 |
+
if (segment["start"] >= start_time and segment["end"] <= end_time):
|
241 |
+
speaker_id = spk_id
|
242 |
+
break
|
243 |
+
|
244 |
+
words = []
|
245 |
+
if word_timestamps and "words" in segment:
|
246 |
+
for word in segment["words"]:
|
247 |
+
words.append({
|
248 |
+
"word": word["word"],
|
249 |
+
"start": word["start"],
|
250 |
+
"end": word["end"],
|
251 |
+
"probability": word.get("probability", 1.0)
|
252 |
+
})
|
253 |
+
|
254 |
+
segments.append(TranscriptionSegment(
|
255 |
+
start=segment["start"],
|
256 |
+
end=segment["end"],
|
257 |
+
text=segment["text"].strip(),
|
258 |
+
language=result.get("language", "unknown"),
|
259 |
+
language_probability=result.get("language_probability", 1.0),
|
260 |
+
no_speech_probability=segment.get("no_speech_prob", 0.0),
|
261 |
+
words=words,
|
262 |
+
speaker_id=speaker_id, # Add speaker information
|
263 |
+
confidence=1.0 - segment.get("no_speech_prob", 0.0),
|
264 |
+
word_timestamps=words
|
265 |
+
))
|
266 |
+
|
267 |
+
return segments
|
268 |
+
|
269 |
+
except Exception as e:
|
270 |
+
logger.error(f"Segment transcription failed: {e}")
|
271 |
+
raise
|
272 |
+
|
273 |
def get_supported_languages(self) -> List[str]:
|
274 |
"""Get list of supported language codes."""
|
275 |
return [
|
startup.py
CHANGED
@@ -58,12 +58,21 @@ def preload_models():
|
|
58 |
import model_preloader
|
59 |
logger.info('✅ Model preloader module found')
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
# Try to run the preloader
|
62 |
result = subprocess.run(
|
63 |
['python', 'model_preloader.py'],
|
64 |
capture_output=True,
|
65 |
text=True,
|
66 |
-
timeout=300 # 5 minute timeout
|
|
|
67 |
)
|
68 |
|
69 |
if result.returncode == 0:
|
@@ -74,7 +83,11 @@ def preload_models():
|
|
74 |
else:
|
75 |
logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
|
76 |
if result.stderr:
|
77 |
-
|
|
|
|
|
|
|
|
|
78 |
return False
|
79 |
|
80 |
except subprocess.TimeoutExpired:
|
|
|
58 |
import model_preloader
|
59 |
logger.info('✅ Model preloader module found')
|
60 |
|
61 |
+
# Set environment variables to handle onnxruntime issues
|
62 |
+
env = os.environ.copy()
|
63 |
+
env.update({
|
64 |
+
'ONNX_EXECUTION_PROVIDER': 'cpu',
|
65 |
+
'DISABLE_ONNX_EXECUTION_PROVIDERS': 'CPUExecutionProvider',
|
66 |
+
'TF_ENABLE_ONEDNN_OPTS': '0'
|
67 |
+
})
|
68 |
+
|
69 |
# Try to run the preloader
|
70 |
result = subprocess.run(
|
71 |
['python', 'model_preloader.py'],
|
72 |
capture_output=True,
|
73 |
text=True,
|
74 |
+
timeout=300, # 5 minute timeout
|
75 |
+
env=env
|
76 |
)
|
77 |
|
78 |
if result.returncode == 0:
|
|
|
83 |
else:
|
84 |
logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
|
85 |
if result.stderr:
|
86 |
+
# Check if it's the onnxruntime issue
|
87 |
+
if 'cannot enable executable stack' in result.stderr:
|
88 |
+
logger.warning('⚠️ ONNX Runtime executable stack issue detected - this is expected in containers')
|
89 |
+
else:
|
90 |
+
logger.warning(f'Preloader stderr: {result.stderr[:500]}...')
|
91 |
return False
|
92 |
|
93 |
except subprocess.TimeoutExpired:
|
web_app.py
CHANGED
@@ -44,8 +44,11 @@ import numpy as np
|
|
44 |
import pandas as pd
|
45 |
from dotenv import load_dotenv
|
46 |
|
47 |
-
# Load environment variables
|
48 |
-
|
|
|
|
|
|
|
49 |
|
50 |
# Add src directory to Python path
|
51 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|
|
|
44 |
import pandas as pd
|
45 |
from dotenv import load_dotenv
|
46 |
|
47 |
+
# Load environment variables with error handling
|
48 |
+
try:
|
49 |
+
load_dotenv()
|
50 |
+
except Exception as e:
|
51 |
+
logging.warning(f"Could not load .env file: {e}")
|
52 |
|
53 |
# Add src directory to Python path
|
54 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
|