File size: 7,487 Bytes
3e27995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Quality Control Module for Audio Intelligence System

This module implements quality checks and model selection strategies
to ensure the system only demonstrates its best capabilities.
"""

import logging
import re
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class QualityController:
    """
    Controls quality of transcription and translation to avoid
    misleading results in demonstrations.

    Offers script-based language validation, heuristic scoring of
    transcribed text, translation strategy selection, and filtering of
    segments before they are shown.
    """

    def __init__(self):
        # Languages where we have good model performance, keyed by
        # language code.
        self.reliable_languages = {
            'hi': {'name': 'Hindi', 'opus_mt': True, 'quality': 'high'},
            'ja': {'name': 'Japanese', 'opus_mt': True, 'quality': 'high'},
            'fr': {'name': 'French', 'opus_mt': True, 'quality': 'high'},
            'en': {'name': 'English', 'opus_mt': True, 'quality': 'high'},
            'ur': {'name': 'Urdu', 'opus_mt': True, 'quality': 'medium'},
            'bn': {'name': 'Bengali', 'opus_mt': True, 'quality': 'medium'},
        }

        # Patterns that indicate poor transcription quality. They are
        # applied with ``re.match``, so each one is anchored at the start
        # of the text.
        self.poor_quality_patterns = [
            r'^(.+?)\1{4,}',  # Same fragment 5+ times in a row
            r'^(तो\s*){10,}',  # Specific Hindi repetition issue
            # The same very short (1-3 character) token repeated 20+ times.
            # The backreference is essential: without it the pattern matches
            # any text that is at least 20 characters long.
            r'^(\S{1,3})(?:\s*\1){19,}',
        ]

    @staticmethod
    def _is_word_char(char: str) -> bool:
        """
        Return True for letters and combining marks (Unicode L* / M*).

        ``str.isalpha`` alone is not enough: vowel signs and other
        combining marks in scripts such as Devanagari or Arabic are not
        alphabetic, yet they are a normal part of written words.
        """
        return unicodedata.category(char)[0] in ('L', 'M')

    def validate_language_detection(self, text: str, detected_lang: str) -> Tuple[str, float]:
        """
        Validate language detection and return corrected language with confidence.

        The decision is based on which script (Devanagari, Arabic, ASCII
        Latin) dominates the letters and combining marks of ``text``.

        Args:
            text: Transcribed text; ``None`` is treated as empty.
            detected_lang: Language code reported by the upstream detector.

        Returns:
            Tuple[str, float]: (corrected_language, confidence)
        """
        # Clean text for analysis
        clean_text = (text or '').strip()

        # Count word characters per script in a single pass. Numerators and
        # the denominator use the same definition of "word character", so
        # every ratio stays within [0, 1].
        devanagari_chars = arabic_chars = latin_chars = total_chars = 0
        for char in clean_text:
            if not self._is_word_char(char):
                continue
            total_chars += 1
            if '\u0900' <= char <= '\u097F':
                devanagari_chars += 1
            elif '\u0600' <= char <= '\u06FF':
                arabic_chars += 1
            elif char.isascii():
                latin_chars += 1

        if total_chars == 0:
            # Nothing to analyse: keep the upstream guess with low confidence
            return detected_lang, 0.1

        # Calculate script ratios
        devanagari_ratio = devanagari_chars / total_chars
        arabic_ratio = arabic_chars / total_chars
        latin_ratio = latin_chars / total_chars

        # High confidence script-based detection
        if devanagari_ratio > 0.8:
            return 'hi', 0.95
        if arabic_ratio > 0.8:
            return 'ur', 0.9
        if latin_ratio > 0.9:
            # Could be English, French, or romanized text
            if detected_lang in ('en', 'fr'):
                return detected_lang, 0.8
            return 'en', 0.7

        # Medium confidence corrections
        if devanagari_ratio > 0.5:
            return 'hi', 0.7
        if arabic_ratio > 0.5:
            return 'ur', 0.7

        # If current detection is unreliable, default to Hindi for Indian audio
        if detected_lang in ('zh', 'th', 'ko') and devanagari_ratio > 0.2:
            return 'hi', 0.6

        return detected_lang, 0.5

    def assess_transcription_quality(self, text: str) -> Dict[str, Any]:
        """
        Assess the quality of transcribed text.

        Args:
            text: Transcribed text; ``None`` is treated as empty.

        Returns:
            Dict with quality assessment:
                'text': the stripped text,
                'quality_score': float, 1.0 scaled down by each issue found
                    (0.0 for empty text),
                'issues': list of issue tags ('very_short', 'empty',
                    'highly_repetitive', 'repetitive', 'pattern_match',
                    'garbled'),
                'recommendation': 'accept', 'filter' or 'reject'.
        """
        clean_text = (text or '').strip()
        words = clean_text.split()

        assessment: Dict[str, Any] = {
            'text': clean_text,
            'quality_score': 1.0,
            'issues': [],
            'recommendation': 'accept'
        }

        # Check text length
        if len(clean_text) < 5:
            assessment['quality_score'] *= 0.3
            assessment['issues'].append('very_short')

        if not words:
            assessment['quality_score'] = 0.0
            assessment['issues'].append('empty')
            assessment['recommendation'] = 'reject'
            return assessment

        # Check for repetition (share of distinct words among all words)
        repetition_ratio = len(set(words)) / len(words)

        if repetition_ratio < 0.3:
            assessment['quality_score'] *= 0.2
            assessment['issues'].append('highly_repetitive')
            assessment['recommendation'] = 'filter'
        elif repetition_ratio < 0.5:
            assessment['quality_score'] *= 0.6
            assessment['issues'].append('repetitive')

        # Check for specific poor quality patterns (penalised at most once)
        if any(re.match(pattern, clean_text) for pattern in self.poor_quality_patterns):
            assessment['quality_score'] *= 0.1
            assessment['issues'].append('pattern_match')
            assessment['recommendation'] = 'reject'

        # Check for garbled text (too many non-word characters). Combining
        # marks count as word characters so that scripts which rely on them
        # are not penalised.
        word_chars = sum(1 for c in clean_text if self._is_word_char(c))
        alpha_ratio = word_chars / max(1, len(clean_text))
        if alpha_ratio < 0.5:
            assessment['quality_score'] *= 0.4
            assessment['issues'].append('garbled')

        # Final recommendation derived from the accumulated score
        if assessment['quality_score'] < 0.2:
            assessment['recommendation'] = 'reject'
        elif assessment['quality_score'] < 0.5:
            assessment['recommendation'] = 'filter'

        return assessment

    def should_process_language(self, language: str) -> bool:
        """
        Determine if we should process this language based on our capabilities.

        Returns:
            True when ``language`` is a key of ``self.reliable_languages``.
        """
        return language in self.reliable_languages

    def get_best_translation_strategy(self, source_lang: str, target_lang: str) -> Dict[str, Any]:
        """
        Get the best translation strategy for the language pair.

        Only ``source_lang`` influences the result; ``target_lang`` is
        accepted but currently unused.

        Returns:
            Dict with 'method', 'confidence' and 'explanation' keys.
        """
        strategy: Dict[str, Any] = {
            'method': 'hybrid',
            'confidence': 0.5,
            'explanation': 'Standard hybrid approach'
        }

        language_info = self.reliable_languages.get(source_lang)
        if language_info is None:
            strategy['method'] = 'google_only'
            strategy['confidence'] = 0.6
            strategy['explanation'] = f'Language {source_lang} not in reliable set, using Google API'
        elif language_info['quality'] == 'high':
            strategy['confidence'] = 0.9
            strategy['explanation'] = f'High quality support for {source_lang}'

        return strategy

    def filter_results_for_demo(self, segments: List) -> List:
        """
        Filter results to show only high-quality segments for demo purposes.

        Segments recommended as 'accept' are kept unchanged. Segments
        recommended as 'filter' are kept but modified in place: their
        ``original_text`` is prefixed with "[Filtered] " and their
        ``confidence_transcription`` is halved. 'reject' segments are
        dropped.

        Args:
            segments: Objects exposing ``original_text`` and
                ``confidence_transcription`` attributes.

        Returns:
            New list holding the kept segments in their original order.
        """
        filtered_segments = []

        for segment in segments:
            # Assess transcription quality
            quality = self.assess_transcription_quality(segment.original_text)

            if quality['recommendation'] == 'accept':
                filtered_segments.append(segment)
            elif quality['recommendation'] == 'filter':
                # Keep but mark as filtered
                segment.original_text = f"[Filtered] {segment.original_text}"
                segment.confidence_transcription *= 0.5
                filtered_segments.append(segment)
            # Skip 'reject' segments entirely

        logger.info(
            "Quality filter: %d -> %d segments",
            len(segments),
            len(filtered_segments),
        )
        return filtered_segments

# Global instance: a module-level QualityController created once at import
# time.
quality_controller = QualityController()