Spaces:

prathameshv07
/

Multilingual-Audio-Intelligence-System

Sleeping

Multilingual-Audio-Intelligence-System / src /speech_recognizer.py

Prathamesh Sarjerao Vaidya

made changes to fix docker deployement issue

fdcc0cf 10 days ago

12.6 kB

	"""
	Advanced Speech Recognition Module for Multilingual Audio Intelligence System

	This module implements state-of-the-art automatic speech recognition using openai-whisper
	with integrated language identification capabilities. Designed for maximum performance
	on CPU-constrained environments while maintaining SOTA accuracy.

	Key Features:
	- OpenAI Whisper with optimized backend for speed improvement
	- Integrated Language Identification (no separate LID module needed)
	- VAD-based batching for real-time performance on CPU
	- Word-level timestamps for interactive UI synchronization
	- Robust error handling and multilingual support
	- CPU and GPU optimization paths

	Model: openai/whisper-small (optimized for speed/accuracy balance)
	Dependencies: openai-whisper, torch, numpy
	"""

	import os
	import logging
	import warnings
	import numpy as np
	import torch
	from typing import List, Dict, Optional, Tuple, Union
	import tempfile
	from dataclasses import dataclass
	import time

	# Configure logging first. A root-level logging call made before
	# basicConfig() installs a default handler at WARNING level, after which
	# basicConfig(level=INFO) silently does nothing.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# openai-whisper is optional at import time; SpeechRecognizer refuses to
	# initialize its model when WHISPER_AVAILABLE is False.
	try:
	    import whisper
	    WHISPER_AVAILABLE = True
	except ImportError:
	    WHISPER_AVAILABLE = False
	    logger.warning("openai-whisper not available. Install with: pip install openai-whisper")

	# Suppress warnings for cleaner output
	warnings.filterwarnings("ignore", category=UserWarning)
	warnings.filterwarnings("ignore", category=FutureWarning)


	@dataclass
	class TranscriptionSegment:
	    """
	    Data class representing a transcribed speech segment with rich metadata.

	    The first six fields are required; the remaining ones default to None
	    and are filled in by the recognizer when the information is available.
	    """
	    # Segment boundaries as reported by the recognizer (presumably seconds
	    # from the start of the audio -- confirm against the Whisper output).
	    start: float
	    end: float
	    # Transcribed text of the segment.
	    text: str
	    # Language code reported for the transcription result.
	    language: str
	    # Confidence attached to `language`.
	    language_probability: float
	    # Probability that the segment contains no speech.
	    no_speech_probability: float
	    # Per-word entries: dicts with "word", "start", "end", "probability".
	    words: Optional[List[Dict]] = None
	    # Speaker label, when the segment was matched to a diarization turn.
	    speaker_id: Optional[str] = None
	    # Segment confidence score.
	    confidence: Optional[float] = None
	    # Same per-word entries as `words`, kept under a second name.
	    word_timestamps: Optional[List[Dict]] = None


	class SpeechRecognizer:
	    """
	    Advanced Speech Recognition Engine using OpenAI Whisper.

	    This class provides high-performance speech recognition with integrated language
	    identification, optimized for both CPU and GPU environments.

	    The Whisper model is loaded eagerly in ``__init__``. All transcription
	    entry points convert their input to 16 kHz float32 audio before handing
	    it to the model.
	    """

	    # Sample rate every audio array is converted to before reaching Whisper.
	    _TARGET_SAMPLE_RATE = 16000

	    # Language codes returned by get_supported_languages().
	    _SUPPORTED_LANGUAGES = (
	        "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr",
	        "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi",
	        "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no",
	        "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk",
	        "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk",
	        "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
	        "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc",
	        "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
	        "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl",
	        "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
	    )

	    def __init__(self, model_size: str = "small", device: str = "auto",
	                 compute_type: str = "int8", language: Optional[str] = None):
	        """
	        Initialize the Speech Recognizer.

	        Args:
	            model_size: Whisper model size (tiny, base, small, medium, large)
	            device: Device to use (auto, cpu, cuda)
	            compute_type: Computation precision (int8, float16, float32).
	                Stored on the instance; nothing in this class reads it back.
	            language: Target language code (None for auto-detection)

	        Raises:
	            ImportError: If openai-whisper is not installed.
	        """
	        self.model_size = model_size
	        self.device = self._determine_device(device)
	        self.compute_type = compute_type
	        self.language = language
	        self.model = None
	        self._initialize_model()

	    def _determine_device(self, device: str) -> str:
	        """Resolve "auto" to cuda, mps or cpu; return any other value unchanged."""
	        if device != "auto":
	            return device
	        if torch.cuda.is_available():
	            return "cuda"
	        # torch.backends.mps is missing on older torch builds, hence hasattr.
	        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
	            return "mps"
	        return "cpu"

	    def _initialize_model(self):
	        """
	        Load the Whisper model onto ``self.device``.

	        Raises:
	            ImportError: If openai-whisper is not installed.
	            Exception: Whatever ``whisper.load_model`` raises, after logging it.
	        """
	        if not WHISPER_AVAILABLE:
	            raise ImportError("openai-whisper is required. Install with: pip install openai-whisper")

	        try:
	            logger.info(f"Loading {self.model_size} Whisper model...")
	            self.model = whisper.load_model(self.model_size, device=self.device)
	            logger.info(f"Speech recognition models loaded on {self.device}")
	        except Exception as e:
	            logger.error(f"Failed to load Whisper model: {e}")
	            raise

	    @staticmethod
	    def _prepare_audio(audio_data, sample_rate: int) -> np.ndarray:
	        """
	        Return ``audio_data`` as a contiguous float32 array at 16 kHz.

	        Whisper's mel front end works in float32; handing it float64 samples
	        fails inside the model, so the cast happens here for every caller.
	        librosa is imported only when resampling is actually required.
	        """
	        audio = np.asarray(audio_data, dtype=np.float32)
	        if sample_rate != SpeechRecognizer._TARGET_SAMPLE_RATE:
	            import librosa
	            audio = librosa.resample(
	                audio,
	                orig_sr=sample_rate,
	                target_sr=SpeechRecognizer._TARGET_SAMPLE_RATE,
	            )
	        return np.ascontiguousarray(audio, dtype=np.float32)

	    @staticmethod
	    def _extract_words(segment: Dict) -> List[Dict]:
	        """Copy the per-word entries of a Whisper segment into plain dicts."""
	        return [
	            {
	                "word": word["word"],
	                "start": word["start"],
	                "end": word["end"],
	                "probability": word.get("probability", 1.0),
	            }
	            for word in segment.get("words", [])
	        ]

	    @staticmethod
	    def _assign_speaker(seg_start: float, seg_end: float,
	                        speaker_segments: List[Tuple[float, float, str]]) -> str:
	        """
	        Pick the speaker label for a transcript segment.

	        The first speaker turn that fully contains the segment wins. When no
	        turn contains it, the turn with the largest positive time overlap is
	        used, so segments straddling a turn boundary are still labelled.
	        Returns "Unknown" when nothing overlaps.
	        """
	        best_speaker = "Unknown"
	        best_overlap = 0.0
	        for turn_start, turn_end, spk_id in speaker_segments:
	            if seg_start >= turn_start and seg_end <= turn_end:
	                return spk_id
	            overlap = min(seg_end, turn_end) - max(seg_start, turn_start)
	            if overlap > best_overlap:
	                best_speaker, best_overlap = spk_id, overlap
	        return best_speaker

	    @staticmethod
	    def _build_segment(segment: Dict, result: Dict, speaker_id: Optional[str],
	                       include_words: bool) -> "TranscriptionSegment":
	        """Convert one Whisper segment dict into a TranscriptionSegment."""
	        words = SpeechRecognizer._extract_words(segment) if include_words else []
	        no_speech = segment.get("no_speech_prob", 0.0)
	        return TranscriptionSegment(
	            start=segment["start"],
	            end=segment["end"],
	            text=segment["text"].strip(),
	            language=result.get("language", "unknown"),
	            # Falls back to 1.0 when the result carries no such key.
	            language_probability=result.get("language_probability", 1.0),
	            no_speech_probability=no_speech,
	            words=words,
	            speaker_id=speaker_id,
	            confidence=1.0 - no_speech,
	            word_timestamps=words,
	        )

	    def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000,
	                         language: Optional[str] = None,
	                         initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
	        """
	        Transcribe audio data with language identification.

	        Args:
	            audio_data: Audio data as numpy array
	            sample_rate: Sample rate of the audio
	            language: Language code; falls back to the instance language,
	                and to auto-detection when both are None
	            initial_prompt: Initial prompt for better transcription

	        Returns:
	            List of TranscriptionSegment objects (speaker_id is None)

	        Raises:
	            RuntimeError: If the model has not been initialized.
	            Exception: Any failure during preparation or decoding is logged
	                and re-raised.
	        """
	        if self.model is None:
	            raise RuntimeError("Model not initialized")

	        try:
	            audio = self._prepare_audio(audio_data, sample_rate)

	            result = self.model.transcribe(
	                audio,
	                language=language or self.language,
	                initial_prompt=initial_prompt,
	                word_timestamps=True,
	                verbose=False,
	            )

	            return [
	                self._build_segment(segment, result, speaker_id=None, include_words=True)
	                for segment in result["segments"]
	            ]

	        except Exception as e:
	            logger.error(f"Transcription failed: {e}")
	            raise

	    def transcribe_file(self, file_path: str, language: Optional[str] = None,
	                        initial_prompt: Optional[str] = None) -> List["TranscriptionSegment"]:
	        """
	        Transcribe an audio file.

	        Args:
	            file_path: Path to audio file
	            language: Language code (None for auto-detection)
	            initial_prompt: Initial prompt for better transcription

	        Returns:
	            List of TranscriptionSegment objects

	        Raises:
	            Exception: Any loading or transcription failure is logged and
	                re-raised.
	        """
	        try:
	            # librosa resamples to 16 kHz while loading.
	            import librosa
	            audio_data, sample_rate = librosa.load(file_path, sr=16000)

	            return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt)

	        except Exception as e:
	            logger.error(f"File transcription failed: {e}")
	            raise

	    def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int,
	                            speaker_segments: List[Tuple[float, float, str]],
	                            word_timestamps: bool = True) -> List["TranscriptionSegment"]:
	        """
	        Transcribe audio segments with speaker information.

	        The whole recording is transcribed in one pass; each resulting
	        segment is then labelled with a speaker from ``speaker_segments``.

	        Args:
	            audio_data: Audio data as numpy array
	            sample_rate: Sample rate of the audio
	            speaker_segments: List of (start_time, end_time, speaker_id) tuples
	            word_timestamps: Whether to include word-level timestamps

	        Returns:
	            List of TranscriptionSegment objects with speaker information;
	            speaker_id is "Unknown" when no speaker turn overlaps a segment

	        Raises:
	            RuntimeError: If the model has not been initialized.
	            Exception: Any failure during preparation or decoding is logged
	                and re-raised.
	        """
	        if self.model is None:
	            raise RuntimeError("Model not initialized")

	        try:
	            audio = self._prepare_audio(audio_data, sample_rate)

	            result = self.model.transcribe(
	                audio,
	                language=self.language,
	                word_timestamps=word_timestamps,
	                verbose=False,
	            )

	            return [
	                self._build_segment(
	                    segment,
	                    result,
	                    speaker_id=self._assign_speaker(
	                        segment["start"], segment["end"], speaker_segments
	                    ),
	                    include_words=word_timestamps,
	                )
	                for segment in result["segments"]
	            ]

	        except Exception as e:
	            logger.error(f"Segment transcription failed: {e}")
	            raise

	    def get_supported_languages(self) -> List[str]:
	        """Get list of supported language codes (a fresh list on every call)."""
	        return list(self._SUPPORTED_LANGUAGES)

	    def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]:
	        """
	        Detect the language of audio data.

	        Uses Whisper's language-identification head on the first 30-second
	        window instead of decoding the whole recording, and returns the
	        probability the model assigns to the winning language.

	        Args:
	            audio_data: Audio data as numpy array
	            sample_rate: Sample rate of the audio

	        Returns:
	            Tuple of (language_code, confidence); ("unknown", 0.0) when
	            detection fails for any reason
	        """
	        try:
	            # English-only checkpoints have no language-identification head.
	            if not self.model.is_multilingual:
	                return "en", 1.0

	            audio = whisper.pad_or_trim(self._prepare_audio(audio_data, sample_rate))
	            mel = whisper.log_mel_spectrogram(
	                audio, n_mels=self.model.dims.n_mels
	            ).to(self.model.device)

	            _, probabilities = self.model.detect_language(mel)
	            language = max(probabilities, key=probabilities.get)
	            return language, float(probabilities[language])

	        except Exception as e:
	            logger.error(f"Language detection failed: {e}")
	            return "unknown", 0.0


	def create_speech_recognizer(model_size: str = "small", device: str = "auto",
	                             compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
	    """
	    Factory function to create a SpeechRecognizer instance.

	    Constructing the recognizer loads the Whisper model immediately, so
	    this call may raise whatever the SpeechRecognizer constructor raises.

	    Args:
	        model_size: Whisper model size
	        device: Device to use
	        compute_type: Computation precision
	        language: Target language code

	    Returns:
	        SpeechRecognizer instance
	    """
	    # Forward by keyword so the mapping does not depend on the
	    # constructor's positional parameter order.
	    return SpeechRecognizer(
	        model_size=model_size,
	        device=device,
	        compute_type=compute_type,
	        language=language,
	    )