Tristan Yu committed
Commit cf3775c · verified · 1 Parent(s): e84ef07

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ static/icon.ico filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,33 @@
+ FROM python:3.9-slim
+
+ # Install system dependencies (must run as root, before switching to the non-root user)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ # Copy requirements first
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the rest of the application
+ COPY --chown=user . .
+
+ # Set environment variables
+ ENV HF_TOKEN=$HF_TOKEN
+ ENV PORT=7860
+
+ # Make port 7860 available (Hugging Face Spaces default)
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,320 @@
+ #!/usr/bin/env python3
+ """
+ Quality Lens - Translation Quality Estimation and Hallucination Detection
+ This script provides a web interface for evaluating translation quality and detecting hallucinations.
+ Features:
+ - Translation quality assessment
+ - Semantic equivalence analysis
+ - Hallucination detection
+ - COMET-QE integration
+ """
+
+ import os
+ import multiprocessing as mp
+ from typing import Optional, Dict, Any, List, cast
+ from flask import Flask, render_template, request, jsonify
+ from blaser_sonar_space import BLASEREvaluator
+ from comet import download_model, load_from_checkpoint
+ from comet.models import RegressionMetric
+ from dotenv import load_dotenv
+ from huggingface_hub import login
+ import pytorch_lightning as pl
+ import torch
+ import traceback
+ import sys
+
+ # Load environment variables
+ load_dotenv()
+
+ # Set tokenizer parallelism to avoid deadlocks
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ # Set multiprocessing start method to 'fork' on Unix systems
+ if os.name != 'nt':  # Not Windows
+     mp.set_start_method('fork', force=True)
+
+ # Login to Hugging Face Hub if token is available
+ if os.getenv("HUGGINGFACE_TOKEN"):
+     login(token=os.getenv("HUGGINGFACE_TOKEN"))
+
+ app = Flask(__name__)
+
+ class COMETEvaluator:
+     """COMET-QE evaluator using the wmt22-comet-da model."""
+
+     def __init__(self):
+         self.model = None
+         self.model_name = "Unbabel/wmt22-comet-da"  # Using the latest recommended model
+
+     def initialize(self) -> bool:
+         """Initialize the COMET model."""
+         try:
+             print(f"🔄 Downloading COMET model {self.model_name}...")
+             model_path = download_model(self.model_name)
+             print("✨ Loading COMET model...")
+
+             # Load and initialize the model
+             self.model = cast(RegressionMetric, load_from_checkpoint(model_path))
+
+             # Set model to evaluation mode
+             self.model.eval()
+
+             # Create a trainer with simplified configuration
+             self.model.trainer = pl.Trainer(
+                 accelerator="mps" if torch.backends.mps.is_available() else "cpu",
+                 devices=1,
+                 enable_progress_bar=True,  # Enable to see progress
+                 logger=False,
+                 strategy="auto"
+             )
+
+             # Move model to appropriate device
+             device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+             print(f"🖥️ Using device: {device}")
+             self.model = self.model.to(device)
+
+             return True
+         except Exception as e:
+             print(f"❌ Failed to initialize COMET model: {str(e)}")
+             traceback.print_exc()  # Print full traceback
+             return False
+
+     def evaluate(self, source: str, hypothesis: str, reference: str) -> Optional[Dict[str, Any]]:
+         """
+         Evaluate translation quality using COMET.
+
+         Args:
+             source: Source text
+             hypothesis: Translation to evaluate
+             reference: Reference translation
+
+         Returns:
+             Dictionary containing evaluation results or None if evaluation fails
+         """
+         if self.model is None:
+             print("❌ COMET model not initialized")
+             return None
+
+         try:
+             print("📊 Starting COMET evaluation...")
+
+             # Prepare data in COMET format
+             data = [{
+                 "src": source,
+                 "mt": hypothesis,
+                 "ref": reference
+             }]
+
+             print("🔄 Data prepared, running prediction...")
+
+             # Set environment variables for multiprocessing
+             os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+             # Call predict method with minimal configuration
+             model_output = self.model.predict(
+                 samples=data,
+                 batch_size=1,
+                 num_workers=1  # Single worker to avoid multiprocessing issues
+             )
+
+             print(f"✨ Raw model output: {model_output}")
+
+             # Get the system score from the output
+             if isinstance(model_output, dict):
+                 comet_score = float(model_output.get('system_score', 0.0))
+             else:
+                 # Try to get the first score if it's a list/tuple
+                 try:
+                     comet_score = float(model_output[0])
+                 except (IndexError, TypeError, ValueError) as e:
+                     print(f"❌ Could not extract score from COMET output: {str(e)}")
+                     return None
+
+             print(f"📈 COMET score: {comet_score}")
+
+             # Map COMET score to BLASER range (1-5)
+             mapped_score = 1 + 4 / (1 + torch.exp(torch.tensor(-comet_score)))
+
+             result = {
+                 "comet_score": comet_score,
+                 "mapped_score": float(mapped_score),
+                 "quality_assessment": "Good" if mapped_score >= 3.5 else "Poor"
+             }
+
+             print(f"✅ Evaluation complete: {result}")
+             return result
+
+         except Exception as e:
+             print(f"❌ COMET evaluation failed: {str(e)}")
+             traceback.print_exc()  # Print full traceback
+             return None
+
+ # Initialize evaluators
+ print("🚀 Initializing Quality Lens components...")
+ blaser_evaluator = BLASEREvaluator()
+ if not blaser_evaluator.initialize():
+     raise RuntimeError("Failed to initialize BLASER evaluator")
+ print("✅ BLASER initialized successfully!")
+
+ print("🚀 Initializing COMET-QE...")
+ comet_evaluator = COMETEvaluator()
+ if not comet_evaluator.initialize():
+     raise RuntimeError("Failed to initialize COMET evaluator")
+ print("✅ COMET-QE initialized successfully!")
+
+ def evaluate_translation(source_text: str, translation_text: str) -> Dict[str, Any]:
+     """Evaluate translation quality using BLASER and COMET."""
+     try:
+         print("📊 Evaluating translation...")
+
+         # Get BLASER score
+         blaser_score = blaser_evaluator.evaluate(source_text, translation_text)
+
+         # Get COMET score
+         comet_result = comet_evaluator.evaluate(
+             source_text,
+             translation_text,
+             ""  # Reference translation is not available in this context
+         )
+
+         if comet_result is None:
+             raise Exception("COMET evaluation failed")
+
+         # Extract scores
+         comet_score = comet_result["comet_score"]
+         comet_mapped_score = comet_result["mapped_score"]
+
+         # Get quality assessment
+         quality = get_quality_assessment(
+             blaser_score=blaser_score,
+             comet_score=comet_mapped_score
+         )
+
+         return {
+             "success": True,
+             "blaser_score": round(blaser_score, 3),
+             "comet_score": round(comet_mapped_score, 3),
+             "raw_comet_score": round(comet_score, 3),
+             "quality_assessment": quality
+         }
+
+     except Exception as e:
+         print(f"❌ Translation evaluation failed: {e}")
+         return {
+             "success": False,
+             "error": str(e)
+         }
+
+ def get_quality_assessment(blaser_score: float, comet_score: Optional[float] = None) -> Dict[str, Any]:
+     """Get quality assessment based on BLASER and COMET scores."""
+
+     # Map BLASER score to quality level
+     if blaser_score >= 4.5:
+         blaser_quality = "Excellent"
+     elif blaser_score >= 4.0:
+         blaser_quality = "Very Good"
+     elif blaser_score >= 3.5:
+         blaser_quality = "Good"
+     elif blaser_score >= 3.0:
+         blaser_quality = "Fair"
+     else:
+         blaser_quality = "Poor"
+
+     # If COMET score is available, map it to quality level
+     comet_quality = None
+     if comet_score is not None:
+         if comet_score >= 4.5:
+             comet_quality = "Excellent"
+         elif comet_score >= 4.0:
+             comet_quality = "Very Good"
+         elif comet_score >= 3.5:
+             comet_quality = "Good"
+         elif comet_score >= 3.0:
+             comet_quality = "Fair"
+         else:
+             comet_quality = "Poor"
+
+     # Determine confidence level based on multiple factors
+     confidence = "High"
+     if comet_quality and comet_score is not None:
+         score_diff = abs(blaser_score - comet_score)
+         avg_score = (blaser_score + comet_score) / 2
+
+         # Very high confidence when scores agree and are in good ranges
+         if score_diff < 0.2 and avg_score >= 4.0:
+             confidence = "Very High"
+         # High confidence when scores are similar and acceptable
+         elif score_diff < 0.3 and avg_score >= 3.5:
+             confidence = "High"
+         # Medium confidence when scores differ moderately or are in mediocre range
+         elif score_diff < 0.5 or avg_score >= 3.0:
+             confidence = "Medium"
+         # Low confidence when scores differ significantly or are poor
+         else:
+             confidence = "Low"
+
+     return {
+         "quality_level": blaser_quality,
+         "comet_quality_level": comet_quality,
+         "confidence": confidence,
+         "explanation": get_quality_explanation(blaser_quality, comet_quality, confidence)
+     }
+
+ def get_quality_explanation(blaser_quality: str, comet_quality: Optional[str], confidence: str) -> str:
+     """Generate explanation for quality assessment."""
+     if not comet_quality:
+         return f"BLASER rates this translation as {blaser_quality}."
+
+     if blaser_quality == comet_quality:
+         if confidence == "Very High":
+             return f"Both BLASER and COMET strongly agree that this translation is {blaser_quality}."
+         else:
+             return f"Both BLASER and COMET agree that this translation is {blaser_quality}."
+
+     if confidence == "Low":
+         return f"There is significant disagreement between metrics: BLASER rates it as {blaser_quality} while COMET rates it as {comet_quality}."
+
+     return f"BLASER rates this translation as {blaser_quality}, while COMET rates it as {comet_quality}. Consider reviewing for potential issues."
+
+ @app.route('/')
+ def index():
+     """Render the main page"""
+     return render_template('index.html')
+
+ @app.route('/evaluate', methods=['POST'])
+ def evaluate():
+     """Handle translation evaluation requests"""
+     try:
+         # Get form data
+         source_text: Optional[str] = request.form.get('source_text')
+         translation_text: Optional[str] = request.form.get('translation_text')
+         source_lang: Optional[str] = request.form.get('source_lang')
+         target_lang: Optional[str] = request.form.get('target_lang')
+
+         # Validate input
+         if not all([source_text, translation_text, source_lang, target_lang]):
+             return jsonify({
+                 'error': 'Missing required fields'
+             }), 400
+
+         # At this point, we know all values are strings
+         assert isinstance(source_text, str)
+         assert isinstance(translation_text, str)
+         assert isinstance(source_lang, str)
+         assert isinstance(target_lang, str)
+
+         # Evaluate translation using both metrics
+         evaluation = evaluate_translation(source_text, translation_text)
+
+         return jsonify(evaluation)
+
+     except Exception as e:
+         print(f"Error during evaluation: {e}")
+         return jsonify({
+             'error': 'An error occurred during evaluation'
+         }), 500
+
+ if __name__ == "__main__":
+     # Run the app
+     port = int(os.environ.get("PORT", 7860))
+     app.run(host="0.0.0.0", port=port)
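
Editor's note: the `/evaluate` route above reads four form fields and returns JSON with `blaser_score`, `comet_score`, `raw_comet_score`, and `quality_assessment`. A minimal client sketch follows; it is not part of this commit, and the base URL is an assumption for a locally running instance on the default port.

```python
# Hypothetical client for the /evaluate endpoint defined in app.py (illustrative, not part of this commit).
import requests

BASE_URL = "http://localhost:7860"  # assumption: app running locally on the default port

payload = {
    "source_text": "Le chat s'assit sur le tapis.",
    "translation_text": "The cat sat down on the carpet.",
    "source_lang": "fra_Latn",
    "target_lang": "eng_Latn",
}

# The route reads request.form, so send a form-encoded POST rather than JSON.
resp = requests.post(f"{BASE_URL}/evaluate", data=payload, timeout=300)
result = resp.json()

if result.get("success"):
    print("BLASER:", result["blaser_score"])
    print("COMET (mapped to 1-5):", result["comet_score"])
    print("Quality:", result["quality_assessment"]["quality_level"])
else:
    print("Evaluation failed:", result.get("error"))
```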
blaser_sonar_space.py ADDED
@@ -0,0 +1,149 @@
+ #!/usr/bin/env python3
+ """
+ BLASER 2.0-QE Implementation using sonar-space package
+ This implementation should give accurate scores matching the official results
+ """
+
+ import torch
+ from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
+ from sonar.models.blaser.loader import load_blaser_model
+
+ class BLASEREvaluator:
+     def __init__(self):
+         """Initialize BLASER evaluator"""
+         self.text_embedder = None
+         self.blaser_model = None
+         self.initialized = False
+
+     def initialize(self):
+         """Initialize models and pipelines"""
+         print("🚀 Initializing BLASER 2.0-QE...")
+         print("This may take a few minutes on first run as models are downloaded...")
+
+         try:
+             # Initialize text embedder with SONAR
+             print("📝 Loading SONAR text embedder...")
+             self.text_embedder = TextToEmbeddingModelPipeline(
+                 encoder="text_sonar_basic_encoder",
+                 tokenizer="text_sonar_basic_encoder"
+             )
+
+             # Load BLASER model
+             print("🎯 Loading BLASER 2.0-QE model...")
+             self.blaser_model = load_blaser_model("blaser_2_0_qe").eval()
+
+             self.initialized = True
+             print("✅ BLASER 2.0-QE initialized successfully!")
+             return True
+
+         except Exception as e:
+             print(f"❌ Initialization failed: {e}")
+             print("Try setting FAIRSEQ2_EXTENSION_TRACE=1 for more details")
+             return False
+
+     def evaluate(self, source_text: str, translation_text: str,
+                  source_lang: str = "fra_Latn", target_lang: str = "eng_Latn") -> float:
+         """
+         Evaluate translation quality using BLASER 2.0-QE
+
+         Args:
+             source_text: Source text
+             translation_text: Machine translation
+             source_lang: Source language code (default: fra_Latn)
+             target_lang: Target language code (default: eng_Latn)
+
+         Returns:
+             BLASER score (higher is better)
+         """
+         if not self.initialized:
+             raise RuntimeError("BLASER not initialized. Call initialize() first.")
+
+         print(f"\n📊 Evaluating translation:")
+         print(f"   Source ({source_lang}): {source_text}")
+         print(f"   Translation ({target_lang}): {translation_text}")
+
+         # Generate embeddings using SONAR
+         print("🔄 Generating embeddings...")
+         src_embs = self.text_embedder.predict([source_text], source_lang=source_lang)
+         mt_embs = self.text_embedder.predict([translation_text], source_lang=target_lang)
+
+         # Get BLASER score
+         print("🔄 Computing BLASER score...")
+         with torch.inference_mode():
+             score = self.blaser_model(src=src_embs, mt=mt_embs).item()
+
+         print(f"✨ BLASER score: {score:.3f}")
+         return score
+
+ def main():
+     """Example usage"""
+     # Initialize evaluator
+     evaluator = BLASEREvaluator()
+     if not evaluator.initialize():
+         print("Failed to initialize BLASER")
+         return
+
+     # Test cases with both directions
+     test_cases = [
+         # French-English pair
+         {
+             "source": "Le chat s'assit sur le tapis.",
+             "translation": "The cat sat down on the carpet.",
+             "source_lang": "fra_Latn",
+             "target_lang": "eng_Latn",
+             "name": "French → English"
+         },
+         {
+             "source": "The cat sat down on the carpet.",
+             "translation": "Le chat s'assit sur le tapis.",
+             "source_lang": "eng_Latn",
+             "target_lang": "fra_Latn",
+             "name": "English → French"
+         },
+         # English-English pair
+         {
+             "source": "The dog is running.",
+             "translation": "The dog runs.",
+             "source_lang": "eng_Latn",
+             "target_lang": "eng_Latn",
+             "name": "English → English (present continuous → simple)"
+         },
+         {
+             "source": "The dog runs.",
+             "translation": "The dog is running.",
+             "source_lang": "eng_Latn",
+             "target_lang": "eng_Latn",
+             "name": "English → English (simple → present continuous)"
+         },
+         # Spanish-English pair
+         {
+             "source": "El gato está sentado en la alfombra.",
+             "translation": "The cat is sitting on the carpet.",
+             "source_lang": "spa_Latn",
+             "target_lang": "eng_Latn",
+             "name": "Spanish → English"
+         },
+         {
+             "source": "The cat is sitting on the carpet.",
+             "translation": "El gato está sentado en la alfombra.",
+             "source_lang": "eng_Latn",
+             "target_lang": "spa_Latn",
+             "name": "English → Spanish"
+         }
+     ]
+
+     print("\n=== Running BLASER evaluations in both directions ===\n")
+
+     for case in test_cases:
+         print(f"\n🔄 Testing: {case['name']}")
+         score = evaluator.evaluate(
+             case["source"],
+             case["translation"],
+             case["source_lang"],
+             case["target_lang"]
+         )
+         print(f"📈 Final score: {score:.3f}")
+         print("   " + "="*50)
+
+ if __name__ == "__main__":
+     main()
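
Editor's note: the front end (templates/index.html below) converts the BLASER score into a hallucination score via HS = 1 − BLASER/5 with a 0.4 threshold. A minimal offline sketch of the same check, reusing the `BLASEREvaluator` above, is shown here; the names and threshold come from this commit, but the script itself is illustrative and not part of the upload.

```python
# Illustrative offline use of BLASEREvaluator with the HS = 1 - BLASER/5 rule from templates/index.html.
from blaser_sonar_space import BLASEREvaluator

evaluator = BLASEREvaluator()
if not evaluator.initialize():
    raise SystemExit("BLASER failed to initialize")

blaser = evaluator.evaluate(
    "Le chat s'assit sur le tapis.",
    "The cat sat down on the carpet.",
    source_lang="fra_Latn",
    target_lang="eng_Latn",
)

# Same formula and threshold the web UI uses (T = 0.4, i.e. BLASER < 3.0 flags a hallucination).
hallucination_score = 1 - blaser / 5
print(f"BLASER: {blaser:.3f}, HS: {hallucination_score:.3f}")
print("Hallucination suspected" if hallucination_score >= 0.4 else "Looks faithful")
```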
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ flask>=2.0.0
+ unbabel-comet>=2.0.0
+ torch>=2.0.0  # Required by COMET
+ transformers>=4.0.0  # Required by COMET
+ sonar-space  # For BLASER evaluation
+ python-dotenv
+ huggingface-hub
+ pytorch-lightning
static/favicon 2.ico ADDED
static/icon.ico ADDED

Git LFS Details

  • SHA256: 97817baf4e42678312576e53bdcff7e30c151332c895774ed314591558de3267
  • Pointer size: 131 Bytes
  • Size of remote file: 248 kB
templates/index.html ADDED
@@ -0,0 +1,320 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Quality Lens</title>
+     <link rel="icon" type="image/x-icon" href="{{ url_for('static', filename='icon.ico') }}">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+     <style>
+         body {
+             background-color: #f8f9fa;
+             padding-top: 2rem;
+         }
+         .score-display {
+             font-size: 2.5rem;
+             font-weight: bold;
+             color: #0d6efd;
+         }
+         .loading {
+             display: none;
+         }
+         .card {
+             box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+         }
+         .language-select {
+             max-width: 200px;
+         }
+         #result {
+             transition: all 0.3s ease;
+         }
+         .score-circle {
+             width: 120px;
+             height: 120px;
+             border-radius: 50%;
+             border: 8px solid #e9ecef;
+             display: flex;
+             flex-direction: column;
+             justify-content: center;
+             align-items: center;
+             margin: 0 auto;
+             transition: border-color 0.3s ease;
+         }
+         .score-circle.excellent {
+             border-color: #28a745;
+         }
+         .score-circle.acceptable {
+             border-color: #17a2b8;
+         }
+         .score-circle.poor {
+             border-color: #dc3545;
+         }
+         .score-label {
+             font-size: 0.8rem;
+             color: #6c757d;
+         }
+         .quality-meter {
+             padding: 10px;
+         }
+         .hallucination-types .alert {
+             margin-bottom: 0.5rem;
+         }
+         .action-item {
+             padding: 0.5rem 0;
+         }
+         .quality-level, .confidence {
+             font-size: 1.1rem;
+         }
+         .explanation {
+             font-size: 1rem;
+             color: #6c757d;
+         }
+     </style>
+ </head>
+ <body>
+     <div class="container">
+         <h1 class="text-center mb-4">Quality Lens</h1>
+         <h5 class="text-center text-muted mb-4">Translation QE & Hallucination Detection</h5>
+
+         <div class="row justify-content-center">
+             <div class="col-md-10">
+                 <div class="card">
+                     <div class="card-body">
+                         <form id="evaluationForm" method="POST">
+                             <div class="row mb-3">
+                                 <!-- Source Text -->
+                                 <div class="col-md-6">
+                                     <div class="form-group">
+                                         <label for="source_text" class="form-label">Source Text:</label>
+                                         <textarea class="form-control" id="source_text" name="source_text" rows="4" required></textarea>
+                                         <div class="mt-2">
+                                             <label for="source_lang" class="form-label">Source Language:</label>
+                                             <select class="form-select language-select" id="source_lang" name="source_lang" required>
+                                                 <option value="eng_Latn">English</option>
+                                                 <option value="fra_Latn">French</option>
+                                                 <option value="spa_Latn">Spanish</option>
+                                                 <option value="deu_Latn">German</option>
+                                                 <option value="ita_Latn">Italian</option>
+                                                 <option value="por_Latn">Portuguese</option>
+                                                 <option value="nld_Latn">Dutch</option>
+                                                 <option value="zho_Hans">Chinese (Simplified)</option>
+                                                 <option value="jpn_Jpan">Japanese</option>
+                                                 <option value="kor_Hang">Korean</option>
+                                             </select>
+                                         </div>
+                                     </div>
+                                 </div>
+
+                                 <!-- Translation Text -->
+                                 <div class="col-md-6">
+                                     <div class="form-group">
+                                         <label for="translation_text" class="form-label">Translation:</label>
+                                         <textarea class="form-control" id="translation_text" name="translation_text" rows="4" required></textarea>
+                                         <div class="mt-2">
+                                             <label for="target_lang" class="form-label">Target Language:</label>
+                                             <select class="form-select language-select" id="target_lang" name="target_lang" required>
+                                                 <option value="eng_Latn">English</option>
+                                                 <option value="fra_Latn">French</option>
+                                                 <option value="spa_Latn">Spanish</option>
+                                                 <option value="deu_Latn">German</option>
+                                                 <option value="ita_Latn">Italian</option>
+                                                 <option value="por_Latn">Portuguese</option>
+                                                 <option value="nld_Latn">Dutch</option>
+                                                 <option value="zho_Hans">Chinese (Simplified)</option>
+                                                 <option value="jpn_Jpan">Japanese</option>
+                                                 <option value="kor_Hang">Korean</option>
+                                             </select>
+                                         </div>
+                                     </div>
+                                 </div>
+                             </div>
+
+                             <!-- Submit Button -->
+                             <div class="text-center">
+                                 <button type="submit" class="btn btn-primary btn-lg px-4">
+                                     Evaluate Translation
+                                 </button>
+                             </div>
+                         </form>
+
+                         <!-- Loading Spinner -->
+                         <div class="loading text-center mt-4">
+                             <div class="spinner-border text-primary" role="status">
+                                 <span class="visually-hidden">Loading...</span>
+                             </div>
+                             <p class="mt-2">Evaluating translation quality...</p>
+                         </div>
+
+                         <!-- Results -->
+                         <div id="result" class="mt-4" style="display: none;">
+                             <div class="row">
+                                 <!-- BLASER Score -->
+                                 <div class="col-md-6 mb-3">
+                                     <div class="card h-100">
+                                         <div class="card-body text-center">
+                                             <h5 class="card-title mb-3">BLASER Score</h5>
+                                             <div class="score-display mb-2" id="blaser_score">0.000</div>
+                                             <p class="text-muted">
+                                                 BLASER scores range from 1 to 5, where 5 indicates perfect semantic equivalence.
+                                             </p>
+                                         </div>
+                                     </div>
+                                 </div>
+
+                                 <!-- COMET Score -->
+                                 <div class="col-md-6 mb-3">
+                                     <div class="card h-100">
+                                         <div class="card-body text-center">
+                                             <h5 class="card-title mb-3">COMET Score</h5>
+                                             <div class="score-display mb-2" id="comet_score">0.000</div>
+                                             <p class="text-muted">
+                                                 COMET scores are mapped to match BLASER's 1-5 range.
+                                             </p>
+                                         </div>
+                                     </div>
+                                 </div>
+                             </div>
+
+                             <!-- Hallucination Detection -->
+                             <div class="card mt-3">
+                                 <div class="card-body">
+                                     <h5 class="card-title">
+                                         <i class="fas fa-exclamation-triangle"></i>
+                                         Hallucination Detection
+                                     </h5>
+                                     <div class="progress mb-3">
+                                         <div id="hallucination_score_bar" class="progress-bar" role="progressbar" style="width: 0%">
+                                             0%
+                                         </div>
+                                     </div>
+                                     <p id="hallucination_details" class="mb-0">
+                                         <!-- Will be populated by JS -->
+                                     </p>
+                                 </div>
+                             </div>
+                         </div>
+                     </div>
+                 </div>
+             </div>
+         </div>
+
+         <!-- Acknowledgments -->
+         <footer class="mt-5 mb-4">
+             <div class="container">
+                 <hr>
+                 <div class="row justify-content-center">
+                     <div class="col-md-8">
+                         <h6 class="text-center mb-3">Acknowledgments</h6>
+                         <div class="d-flex flex-wrap justify-content-center gap-4">
+                             <div class="text-center">
+                                 <a href="https://huggingface.co/facebook/blaser-2.0-qe" target="_blank" class="text-decoration-none">
+                                     <span class="badge bg-light text-dark border">BLASER 2.0</span>
+                                 </a>
+                                 <div class="small text-muted">CC BY-NC 4.0</div>
+                             </div>
+                             <div class="text-center">
+                                 <a href="https://github.com/facebookresearch/SONAR" target="_blank" class="text-decoration-none">
+                                     <span class="badge bg-light text-dark border">SONAR</span>
+                                 </a>
+                                 <div class="small text-muted">MIT</div>
+                             </div>
+                             <div class="text-center">
+                                 <a href="https://github.com/Unbabel/COMET" target="_blank" class="text-decoration-none">
+                                     <span class="badge bg-light text-dark border">COMET-QE</span>
+                                 </a>
+                                 <div class="small text-muted">CC BY-NC-SA 4.0</div>
+                             </div>
+                             <div class="text-center">
+                                 <a href="https://arxiv.org/abs/2501.17295" target="_blank" class="text-decoration-none">
+                                     <span class="badge bg-light text-dark border">Hallucination Detection</span>
+                                 </a>
+                                 <div class="small text-muted">arXiv:2501.17295</div>
+                             </div>
+                         </div>
+                         <p class="small text-muted text-center mt-3">
+                             For research and non-commercial use only
+                         </p>
+                     </div>
+                 </div>
+             </div>
+         </footer>
+     </div>
+
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
+     <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
+     <script>
+         function updateAssessment(data) {
+             if (!data.success) {
+                 throw new Error(data.error || 'Unknown error occurred');
+             }
+
+             // Update scores
+             document.getElementById('blaser_score').textContent = data.blaser_score.toFixed(3);
+             document.getElementById('comet_score').textContent = data.comet_score.toFixed(3);
+
+             // Show results
+             document.getElementById('result').style.display = 'block';
+
+             // Calculate hallucination score based on BLASER 2.0-QE paper
+             // HS(x,y) = 1 - BLASER(x,y)/5
+             const hallucinationScore = 1 - (data.blaser_score / 5);
+             const riskPercentage = Math.round(hallucinationScore * 100);
+
+             // Update progress bar
+             const riskBar = document.getElementById('hallucination_score_bar');
+             riskBar.style.width = `${riskPercentage}%`;
+             riskBar.textContent = `${riskPercentage}%`;
+
+             // Using threshold T=0.4 (equivalent to BLASER score of 3.0)
+             // This means HS >= 0.4 indicates hallucination
+             if (hallucinationScore >= 0.4) {
+                 riskBar.className = 'progress-bar bg-danger';
+                 document.getElementById('hallucination_details').textContent =
+                     'High likelihood of hallucination detected (HS ≥ 0.4). The translation may contain fabricated content.';
+             } else if (hallucinationScore >= 0.3) {
+                 riskBar.className = 'progress-bar bg-warning';
+                 document.getElementById('hallucination_details').textContent =
+                     'Moderate risk of semantic divergence (0.3 ≤ HS < 0.4).';
+             } else {
+                 riskBar.className = 'progress-bar bg-info';
+                 document.getElementById('hallucination_details').textContent =
+                     'Low risk of hallucination (HS < 0.3). Translation appears semantically faithful.';
+             }
+         }
+
+         document.getElementById('evaluationForm').addEventListener('submit', async function(e) {
+             e.preventDefault();
+
+             // Show loading spinner
+             document.querySelector('.loading').style.display = 'block';
+             document.getElementById('result').style.display = 'none';
+
+             // Get form data
+             const formData = new FormData(this);
+
+             try {
+                 // Send request to backend
+                 const response = await fetch('/evaluate', {
+                     method: 'POST',
+                     body: formData
+                 });
+
+                 const data = await response.json();
+
+                 if (!data.success) {
+                     throw new Error(data.error || 'Unknown error occurred');
+                 }
+
+                 // Update assessment display
+                 updateAssessment(data);
+             } catch (error) {
+                 console.error('Error:', error);
+                 alert('An error occurred while evaluating the translation. Please try again.');
+             } finally {
+                 // Hide loading spinner
+                 document.querySelector('.loading').style.display = 'none';
+             }
+         });
+     </script>
+ </body>
+ </html>
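
Editor's note: for readers checking the numbers the UI displays, app.py maps the raw COMET score s into BLASER's 1-5 range as 1 + 4·σ(s), where σ is the logistic sigmoid. A small sanity-check sketch (the raw score below is an arbitrary example, not output from this commit):

```python
# Reproduces the COMET -> 1..5 mapping used in app.py: mapped = 1 + 4 * sigmoid(raw).
import math

def map_comet_to_blaser_range(raw_score: float) -> float:
    return 1 + 4 / (1 + math.exp(-raw_score))

raw = 0.85  # arbitrary example; wmt22-comet-da scores typically fall roughly in the 0-1 range
print(f"raw={raw:.2f} -> mapped={map_comet_to_blaser_range(raw):.2f}")  # ~3.80
```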