|
""" |
|
Advanced Speech Recognition Module for Multilingual Audio Intelligence System |
|
|
|
This module implements state-of-the-art automatic speech recognition using openai-whisper |
|
with integrated language identification capabilities. Designed for maximum performance |
|
on CPU-constrained environments while maintaining SOTA accuracy. |
|
|
|
Key Features: |
|
- OpenAI Whisper with optimized backend for speed improvement |
|
- Integrated Language Identification (no separate LID module needed) |
|
- VAD-based batching for real-time performance on CPU |
|
- Word-level timestamps for interactive UI synchronization |
|
- Robust error handling and multilingual support |
|
- CPU and GPU optimization paths |
|
|
|
Model: openai/whisper-small (optimized for speed/accuracy balance) |
|
Dependencies: openai-whisper, torch, numpy |
|
""" |
|
|
|
import os |
|
import logging |
|
import warnings |
|
import numpy as np |
|
import torch |
|
from typing import List, Dict, Optional, Tuple, Union |
|
import tempfile |
|
from dataclasses import dataclass |
|
import time |
|
|
|
try: |
|
import whisper |
|
WHISPER_AVAILABLE = True |
|
except ImportError: |
|
WHISPER_AVAILABLE = False |
|
logging.warning("openai-whisper not available. Install with: pip install openai-whisper") |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
warnings.filterwarnings("ignore", category=UserWarning) |
|
warnings.filterwarnings("ignore", category=FutureWarning) |
|
|
|
|
|
@dataclass |
|
class TranscriptionSegment: |
|
""" |
|
Data class representing a transcribed speech segment with rich metadata. |
|
""" |
|
start: float |
|
end: float |
|
text: str |
|
language: str |
|
language_probability: float |
|
no_speech_probability: float |
|
words: Optional[List[Dict]] = None |
|
speaker_id: Optional[str] = None |
|
confidence: Optional[float] = None |
|
word_timestamps: Optional[List[Dict]] = None |
|
|
|
|
|
class SpeechRecognizer: |
|
""" |
|
Advanced Speech Recognition Engine using OpenAI Whisper. |
|
|
|
This class provides high-performance speech recognition with integrated language |
|
identification, optimized for both CPU and GPU environments. |
|
""" |
|
|
|
def __init__(self, model_size: str = "small", device: str = "auto", |
|
compute_type: str = "int8", language: Optional[str] = None): |
|
""" |
|
Initialize the Speech Recognizer. |
|
|
|
Args: |
|
model_size: Whisper model size (tiny, base, small, medium, large) |
|
device: Device to use (auto, cpu, cuda) |
|
compute_type: Computation precision (int8, float16, float32) |
|
language: Target language code (None for auto-detection) |
|
""" |
|
self.model_size = model_size |
|
self.device = self._determine_device(device) |
|
self.compute_type = compute_type |
|
self.language = language |
|
self.model = None |
|
self._initialize_model() |
|
|
|
def _determine_device(self, device: str) -> str: |
|
"""Determine the best available device.""" |
|
if device == "auto": |
|
if torch.cuda.is_available(): |
|
return "cuda" |
|
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): |
|
return "mps" |
|
else: |
|
return "cpu" |
|
return device |
|
|
|
def _initialize_model(self): |
|
"""Initialize the Whisper model.""" |
|
if not WHISPER_AVAILABLE: |
|
raise ImportError("openai-whisper is required. Install with: pip install openai-whisper") |
|
|
|
try: |
|
logger.info(f"Loading {self.model_size} Whisper model...") |
|
self.model = whisper.load_model(self.model_size, device=self.device) |
|
logger.info(f"Speech recognition models loaded on {self.device}") |
|
except Exception as e: |
|
logger.error(f"Failed to load Whisper model: {e}") |
|
raise |
|
|
|
def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000, |
|
language: Optional[str] = None, |
|
initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]: |
|
""" |
|
Transcribe audio data with language identification. |
|
|
|
Args: |
|
audio_data: Audio data as numpy array |
|
sample_rate: Sample rate of the audio |
|
language: Language code (None for auto-detection) |
|
initial_prompt: Initial prompt for better transcription |
|
|
|
Returns: |
|
List of TranscriptionSegment objects |
|
""" |
|
if self.model is None: |
|
raise RuntimeError("Model not initialized") |
|
|
|
try: |
|
|
|
if sample_rate != 16000: |
|
import librosa |
|
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
|
|
|
|
|
result = self.model.transcribe( |
|
audio_data, |
|
language=language or self.language, |
|
initial_prompt=initial_prompt, |
|
word_timestamps=True, |
|
verbose=False |
|
) |
|
|
|
|
|
segments = [] |
|
for segment in result["segments"]: |
|
words = [] |
|
if "words" in segment: |
|
for word in segment["words"]: |
|
words.append({ |
|
"word": word["word"], |
|
"start": word["start"], |
|
"end": word["end"], |
|
"probability": word.get("probability", 1.0) |
|
}) |
|
|
|
segments.append(TranscriptionSegment( |
|
start=segment["start"], |
|
end=segment["end"], |
|
text=segment["text"].strip(), |
|
language=result.get("language", "unknown"), |
|
language_probability=result.get("language_probability", 1.0), |
|
no_speech_probability=segment.get("no_speech_prob", 0.0), |
|
words=words, |
|
speaker_id=None, |
|
confidence=1.0 - segment.get("no_speech_prob", 0.0), |
|
word_timestamps=words |
|
)) |
|
|
|
return segments |
|
|
|
except Exception as e: |
|
logger.error(f"Transcription failed: {e}") |
|
raise |
|
|
|
def transcribe_file(self, file_path: str, language: Optional[str] = None, |
|
initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]: |
|
""" |
|
Transcribe an audio file. |
|
|
|
Args: |
|
file_path: Path to audio file |
|
language: Language code (None for auto-detection) |
|
initial_prompt: Initial prompt for better transcription |
|
|
|
Returns: |
|
List of TranscriptionSegment objects |
|
""" |
|
try: |
|
|
|
import librosa |
|
audio_data, sample_rate = librosa.load(file_path, sr=16000) |
|
|
|
return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt) |
|
|
|
except Exception as e: |
|
logger.error(f"File transcription failed: {e}") |
|
raise |
|
|
|
def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int, |
|
speaker_segments: List[Tuple[float, float, str]], |
|
word_timestamps: bool = True) -> List[TranscriptionSegment]: |
|
""" |
|
Transcribe audio segments with speaker information. |
|
|
|
Args: |
|
audio_data: Audio data as numpy array |
|
sample_rate: Sample rate of the audio |
|
speaker_segments: List of (start_time, end_time, speaker_id) tuples |
|
word_timestamps: Whether to include word-level timestamps |
|
|
|
Returns: |
|
List of TranscriptionSegment objects with speaker information |
|
""" |
|
if self.model is None: |
|
raise RuntimeError("Model not initialized") |
|
|
|
try: |
|
|
|
if sample_rate != 16000: |
|
import librosa |
|
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
|
|
|
|
|
result = self.model.transcribe( |
|
audio_data, |
|
language=self.language, |
|
word_timestamps=word_timestamps, |
|
verbose=False |
|
) |
|
|
|
|
|
segments = [] |
|
for segment in result["segments"]: |
|
|
|
speaker_id = "Unknown" |
|
for start_time, end_time, spk_id in speaker_segments: |
|
if (segment["start"] >= start_time and segment["end"] <= end_time): |
|
speaker_id = spk_id |
|
break |
|
|
|
words = [] |
|
if word_timestamps and "words" in segment: |
|
for word in segment["words"]: |
|
words.append({ |
|
"word": word["word"], |
|
"start": word["start"], |
|
"end": word["end"], |
|
"probability": word.get("probability", 1.0) |
|
}) |
|
|
|
segments.append(TranscriptionSegment( |
|
start=segment["start"], |
|
end=segment["end"], |
|
text=segment["text"].strip(), |
|
language=result.get("language", "unknown"), |
|
language_probability=result.get("language_probability", 1.0), |
|
no_speech_probability=segment.get("no_speech_prob", 0.0), |
|
words=words, |
|
speaker_id=speaker_id, |
|
confidence=1.0 - segment.get("no_speech_prob", 0.0), |
|
word_timestamps=words |
|
)) |
|
|
|
return segments |
|
|
|
except Exception as e: |
|
logger.error(f"Segment transcription failed: {e}") |
|
raise |
|
|
|
def get_supported_languages(self) -> List[str]: |
|
"""Get list of supported language codes.""" |
|
return [ |
|
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su" |
|
] |
|
|
|
def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]: |
|
""" |
|
Detect the language of audio data. |
|
|
|
Args: |
|
audio_data: Audio data as numpy array |
|
sample_rate: Sample rate of the audio |
|
|
|
Returns: |
|
Tuple of (language_code, confidence) |
|
""" |
|
try: |
|
|
|
if sample_rate != 16000: |
|
import librosa |
|
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
|
|
|
|
|
result = self.model.transcribe(audio_data, language=None, verbose=False) |
|
|
|
return result.get("language", "unknown"), result.get("language_probability", 0.0) |
|
|
|
except Exception as e: |
|
logger.error(f"Language detection failed: {e}") |
|
return "unknown", 0.0 |
|
|
|
|
|
def create_speech_recognizer(model_size: str = "small", device: str = "auto", |
|
compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer: |
|
""" |
|
Factory function to create a SpeechRecognizer instance. |
|
|
|
Args: |
|
model_size: Whisper model size |
|
device: Device to use |
|
compute_type: Computation precision |
|
language: Target language code |
|
|
|
Returns: |
|
SpeechRecognizer instance |
|
""" |
|
return SpeechRecognizer(model_size, device, compute_type, language) |