"""
Quality Control Module for Audio Intelligence System.

This module implements quality checks and model selection strategies
to ensure the system only demonstrates its best capabilities.
"""
|
|
|
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
class QualityController:
    """
    Controls quality of transcription and translation to avoid
    misleading results in demonstrations.

    Responsibilities:
      * script-based sanity checking of language-identification results,
      * heuristic quality scoring of transcribed text,
      * choosing a translation strategy per source language,
      * filtering low-quality segments before a demo.
    """

    def __init__(self):
        # Languages with known-good model coverage. 'opus_mt' flags
        # availability of an OPUS-MT model; 'quality' is the subjective
        # tier consumed by get_best_translation_strategy().
        self.reliable_languages = {
            'hi': {'name': 'Hindi', 'opus_mt': True, 'quality': 'high'},
            'ja': {'name': 'Japanese', 'opus_mt': True, 'quality': 'high'},
            'fr': {'name': 'French', 'opus_mt': True, 'quality': 'high'},
            'en': {'name': 'English', 'opus_mt': True, 'quality': 'high'},
            'ur': {'name': 'Urdu', 'opus_mt': True, 'quality': 'medium'},
            'bn': {'name': 'Bengali', 'opus_mt': True, 'quality': 'medium'},
        }

        # Regexes that mark a transcript as junk when matched at the
        # start of the text:
        #   1. any token repeated 5+ times back-to-back,
        #   2. the Hindi filler word repeated 10+ times,
        #   3. 20+ whitespace-separated "words" of at most 3 characters
        #      (babble/stutter output).
        self.poor_quality_patterns = [
            r'^(.+?)\1{4,}',
            r'^(तो\s*){10,}',
            # BUG FIX: was r'^(.{1,3}\s*){20,}'. With \s* the separator is
            # optional, so 20 single-character chunks satisfied the pattern
            # and ANY text of >= 20 characters was flagged as junk. Require
            # at least one whitespace character between chunks.
            r'^(.{1,3}\s+){20,}',
        ]

    def validate_language_detection(self, text: str, detected_lang: str) -> Tuple[str, float]:
        """
        Validate language detection and return corrected language with confidence.

        Uses Unicode-script character ratios (Devanagari, Arabic, Latin) to
        confirm or override the upstream detector's decision.

        Args:
            text: The transcribed text to inspect.
            detected_lang: ISO 639-1 code produced by the upstream detector.

        Returns:
            Tuple[str, float]: (corrected_language, confidence)
        """
        clean_text = text.strip()

        # Count characters per script block.
        devanagari_chars = sum(1 for char in clean_text if '\u0900' <= char <= '\u097F')
        arabic_chars = sum(1 for char in clean_text if '\u0600' <= char <= '\u06FF')
        latin_chars = sum(1 for char in clean_text if char.isascii() and char.isalpha())

        total_chars = len([c for c in clean_text if c.isalpha()])

        # No alphabetic content: nothing to validate against, keep the
        # detector's answer but with minimal confidence.
        if total_chars == 0:
            return detected_lang, 0.1

        devanagari_ratio = devanagari_chars / total_chars
        arabic_ratio = arabic_chars / total_chars
        latin_ratio = latin_chars / total_chars

        # Strongly dominant script: override the detector outright.
        if devanagari_ratio > 0.8:
            return 'hi', 0.95
        elif arabic_ratio > 0.8:
            return 'ur', 0.9
        elif latin_ratio > 0.9:
            # Latin script alone cannot distinguish en/fr, so trust the
            # detector within that pair; otherwise default to English.
            if detected_lang in ['en', 'fr']:
                return detected_lang, 0.8
            return 'en', 0.7

        # Majority (but not dominant) script: override with less confidence.
        if devanagari_ratio > 0.5:
            return 'hi', 0.7
        elif arabic_ratio > 0.5:
            return 'ur', 0.7

        # Known failure mode: zh/th/ko false positives on Hindi audio.
        if detected_lang in ['zh', 'th', 'ko'] and devanagari_ratio > 0.2:
            return 'hi', 0.6

        return detected_lang, 0.5

    def assess_transcription_quality(self, text: str) -> Dict[str, Any]:
        """
        Assess the quality of transcribed text.

        Combines length, word-repetition, junk-pattern, and alphabetic-ratio
        heuristics into a multiplicative quality score.

        Args:
            text: The transcribed text to score.

        Returns:
            Dict with keys 'text', 'quality_score' (0.0-1.0), 'issues'
            (list of detected problem tags) and 'recommendation'
            ('accept' | 'filter' | 'reject').
        """
        clean_text = text.strip()
        words = clean_text.split()

        assessment = {
            'text': clean_text,
            'quality_score': 1.0,
            'issues': [],
            'recommendation': 'accept'
        }

        # Very short transcripts are rarely useful on their own.
        if len(clean_text) < 5:
            assessment['quality_score'] *= 0.3
            assessment['issues'].append('very_short')

        # Empty text is an immediate reject; no further checks apply.
        if len(words) == 0:
            assessment['quality_score'] = 0.0
            assessment['issues'].append('empty')
            assessment['recommendation'] = 'reject'
            return assessment

        # Low unique-word ratio indicates the ASR model looping on itself.
        unique_words = set(words)
        repetition_ratio = len(unique_words) / len(words)

        if repetition_ratio < 0.3:
            assessment['quality_score'] *= 0.2
            assessment['issues'].append('highly_repetitive')
            assessment['recommendation'] = 'filter'
        elif repetition_ratio < 0.5:
            assessment['quality_score'] *= 0.6
            assessment['issues'].append('repetitive')

        # Known junk shapes (see __init__) are rejected outright.
        for pattern in self.poor_quality_patterns:
            if re.match(pattern, clean_text):
                assessment['quality_score'] *= 0.1
                assessment['issues'].append('pattern_match')
                assessment['recommendation'] = 'reject'
                break

        # Mostly non-alphabetic output usually means garbled decoding.
        alpha_ratio = len([c for c in clean_text if c.isalpha()]) / max(1, len(clean_text))
        if alpha_ratio < 0.5:
            assessment['quality_score'] *= 0.4
            assessment['issues'].append('garbled')

        # Final thresholds on the accumulated score.
        if assessment['quality_score'] < 0.2:
            assessment['recommendation'] = 'reject'
        elif assessment['quality_score'] < 0.5:
            assessment['recommendation'] = 'filter'

        return assessment

    def should_process_language(self, language: str) -> bool:
        """
        Determine if we should process this language based on our capabilities.

        Args:
            language: ISO 639-1 language code.

        Returns:
            True when the language is in the reliable set.
        """
        return language in self.reliable_languages

    def get_best_translation_strategy(self, source_lang: str, target_lang: str) -> Dict[str, Any]:
        """
        Get the best translation strategy for the language pair.

        Args:
            source_lang: ISO 639-1 code of the source language.
            target_lang: ISO 639-1 code of the target language
                (currently unused by the heuristic, kept for interface
                stability).

        Returns:
            Dict with 'method', 'confidence' and 'explanation'.
        """
        strategy = {
            'method': 'hybrid',
            'confidence': 0.5,
            'explanation': 'Standard hybrid approach'
        }

        if source_lang not in self.reliable_languages:
            # Unknown language: fall back to the external API only.
            strategy['method'] = 'google_only'
            strategy['confidence'] = 0.6
            strategy['explanation'] = f'Language {source_lang} not in reliable set, using Google API'
        elif self.reliable_languages[source_lang]['quality'] == 'high':
            strategy['confidence'] = 0.9
            strategy['explanation'] = f'High quality support for {source_lang}'

        return strategy

    def filter_results_for_demo(self, segments: List) -> List:
        """
        Filter results to show only high-quality segments for demo purposes.

        Segments recommended 'accept' pass through unchanged; 'filter'
        segments are kept but tagged and down-weighted (mutated in place);
        'reject' segments are dropped.

        Args:
            segments: Objects exposing `original_text` and
                `confidence_transcription` attributes.

        Returns:
            The filtered list of segments.
        """
        filtered_segments = []

        for segment in segments:
            quality = self.assess_transcription_quality(segment.original_text)

            if quality['recommendation'] == 'accept':
                filtered_segments.append(segment)
            elif quality['recommendation'] == 'filter':
                # Keep the segment but mark it and halve its confidence.
                segment.original_text = f"[Filtered] {segment.original_text}"
                segment.confidence_transcription *= 0.5
                filtered_segments.append(segment)
            # 'reject' segments are silently dropped.

        logger.info(f"Quality filter: {len(segments)} → {len(filtered_segments)} segments")
        return filtered_segments
|
|
|
|
|
# Module-level singleton so callers can share one controller instance
# (import-time construction; __init__ only builds static tables).
quality_controller = QualityController()
|
|
|
|
|
|