Spaces:
Sleeping
Sleeping
Tristan Yu
committed on
Upload 7 files
Browse files- .gitattributes +1 -0
- Dockerfile +33 -0
- app.py +320 -0
- blaser_sonar_space.py +149 -0
- requirements.txt +8 -0
- static/favicon 2.ico +0 -0
- static/icon.ico +3 -0
- templates/index.html +320 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
static/icon.ico filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim

# Install system dependencies while the build is still running as root:
# apt-get needs write access to /var/lib/apt and /var/lib/dpkg, which the
# unprivileged user created below does not have.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user and run everything from here on as that user
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy requirements first so the dependency layer is reused when only code changes
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application
COPY --chown=user . .

# Set environment variables.
# No token is baked into the image: without a matching ARG, a line such as
# `ENV HF_TOKEN=$HF_TOKEN` expands to an empty string at build time. Supply the
# Hugging Face token as a runtime secret / environment variable instead.
ENV PORT=7860

# Make port 7860 available (Hugging Face Spaces default)
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Quality Lens - Translation Quality Estimation and Hallucination Detection
|
| 4 |
+
This script provides a web interface for evaluating translation quality and detecting hallucinations.
|
| 5 |
+
Features:
|
| 6 |
+
- Translation quality assessment
|
| 7 |
+
- Semantic equivalence analysis
|
| 8 |
+
- Hallucination detection
|
| 9 |
+
- COMET-QE integration
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import multiprocessing as mp
|
| 14 |
+
from typing import Optional, Dict, Any, List, cast
|
| 15 |
+
from flask import Flask, render_template, request, jsonify
|
| 16 |
+
from blaser_sonar_space import BLASEREvaluator
|
| 17 |
+
from comet import download_model, load_from_checkpoint
|
| 18 |
+
from comet.models import RegressionMetric
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
from huggingface_hub import login
|
| 21 |
+
import pytorch_lightning as pl
|
| 22 |
+
import torch
|
| 23 |
+
import traceback
|
| 24 |
+
import sys
|
| 25 |
+
|
| 26 |
+
# Load environment variables (python-dotenv reads a local .env file if present)
load_dotenv()

# Set tokenizer parallelism to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set multiprocessing start method to 'fork' on Unix systems
if os.name != 'nt':  # Not Windows
    mp.set_start_method('fork', force=True)

# Login to Hugging Face Hub if a token is available.
# HUGGINGFACE_TOKEN (the name this script has always read) takes precedence;
# HF_TOKEN is accepted as a fallback because that is the variable name
# huggingface_hub documents, so a token supplied under either name is honoured.
_hf_token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

app = Flask(__name__)
|
| 41 |
+
|
| 42 |
+
class COMETEvaluator:
    """COMET evaluator backed by the ``Unbabel/wmt22-comet-da`` checkpoint.

    The model is loaded by :meth:`initialize`; until that succeeds,
    :meth:`evaluate` returns ``None``.

    NOTE(review): wmt22-comet-da is, as far as its model card describes, a
    reference-based metric. Confirm that scoring with an empty ``reference`` is
    intended, or switch to a reference-free (QE) checkpoint.
    """

    def __init__(self):
        # Set by initialize(); None means "not loaded".
        self.model = None
        self.model_name = "Unbabel/wmt22-comet-da"  # Using the latest recommended model

    def initialize(self) -> bool:
        """Download the checkpoint, load it and move it to the chosen device.

        Returns:
            True when the model is ready. False when any step raised; the error
            and full traceback are printed and ``self.model`` is left as None so
            a half-configured model is never used by :meth:`evaluate`.
        """
        try:
            print(f"🔄 Downloading COMET model {self.model_name}...")
            model_path = download_model(self.model_name)
            print("✨ Loading COMET model...")

            # Load and initialize the model
            model = cast(RegressionMetric, load_from_checkpoint(model_path))

            # Set model to evaluation mode
            model.eval()

            # Decide the backend once so the trainer and the model always agree.
            backend = "mps" if torch.backends.mps.is_available() else "cpu"

            # Create a trainer with simplified configuration
            model.trainer = pl.Trainer(
                accelerator=backend,
                devices=1,
                enable_progress_bar=True,  # Enable to see progress
                logger=False,
                strategy="auto",
            )

            # Move model to appropriate device
            device = torch.device(backend)
            print(f"🖥️ Using device: {device}")

            # Publish the model only after every step above succeeded.
            self.model = model.to(device)
            return True
        except Exception as e:
            self.model = None
            print(f"❌ Failed to initialize COMET model: {str(e)}")
            traceback.print_exc()  # Print full traceback
            return False

    @staticmethod
    def _extract_score(model_output: Any) -> Optional[float]:
        """Pull a single float score out of whatever ``predict`` returned.

        Dict-like outputs are read from ``system_score`` and, if that key is
        absent, from the first entry of ``scores``. Anything else is treated as
        a sequence whose first element is the score.

        Returns:
            The score, or None when no score can be extracted. A missing score
            is reported as a failure rather than silently replaced by 0.0.
        """
        try:
            if isinstance(model_output, dict):
                if "system_score" in model_output:
                    return float(model_output["system_score"])
                return float(model_output["scores"][0])
            return float(model_output[0])
        except (KeyError, IndexError, TypeError, ValueError) as e:
            print(f"❌ Could not extract score from COMET output: {str(e)}")
            return None

    def evaluate(self, source: str, hypothesis: str, reference: str) -> Optional[Dict[str, Any]]:
        """
        Evaluate translation quality using COMET.

        Args:
            source: Source text
            hypothesis: Translation to evaluate
            reference: Reference translation

        Returns:
            Dictionary with ``comet_score`` (raw score), ``mapped_score`` (the
            raw score squashed into the 1-5 range) and ``quality_assessment``
            ("Good" when the mapped score is at least 3.5, otherwise "Poor"),
            or None if the model is not initialized or evaluation fails.
        """
        if self.model is None:
            print("❌ COMET model not initialized")
            return None

        try:
            print("📊 Starting COMET evaluation...")

            # Prepare data in COMET format
            data = [{
                "src": source,
                "mt": hypothesis,
                "ref": reference,
            }]

            print("🔄 Data prepared, running prediction...")

            # Set environment variables for multiprocessing
            os.environ["TOKENIZERS_PARALLELISM"] = "false"

            # Call predict method with minimal configuration
            model_output = self.model.predict(
                samples=data,
                batch_size=1,
                num_workers=1,  # Use 1 worker to enable multiprocessing
            )

            print(f"✨ Raw model output: {model_output}")

            comet_score = self._extract_score(model_output)
            if comet_score is None:
                return None

            print(f"📈 COMET score: {comet_score}")

            # Map COMET score to BLASER range (1-5) with a logistic squash:
            # 1 + 4 * sigmoid(score). Converted to a plain float straight away
            # so the comparison and the result dict never hold a tensor.
            mapped_score = float(1 + 4 / (1 + torch.exp(torch.tensor(-comet_score))))

            result = {
                "comet_score": comet_score,
                "mapped_score": mapped_score,
                "quality_assessment": "Good" if mapped_score >= 3.5 else "Poor",
            }

            print(f"✅ Evaluation complete: {result}")
            return result

        except Exception as e:
            print(f"❌ COMET evaluation failed: {str(e)}")
            traceback.print_exc()  # Print full traceback
            return None
|
| 151 |
+
|
| 152 |
+
# Build both evaluators eagerly at import time. Either one failing to load
# aborts start-up with a RuntimeError instead of serving a half-working app.
print("🚀 Initializing Quality Lens components...")
blaser_evaluator = BLASEREvaluator()
blaser_ready = blaser_evaluator.initialize()
if not blaser_ready:
    raise RuntimeError("Failed to initialize BLASER evaluator")
print("✅ BLASER initialized successfully!")

print("🚀 Initializing COMET-QE...")
comet_evaluator = COMETEvaluator()
comet_ready = comet_evaluator.initialize()
if not comet_ready:
    raise RuntimeError("Failed to initialize COMET evaluator")
print("✅ COMET-QE initialized successfully!")
|
| 164 |
+
|
| 165 |
+
# Lower bounds (inclusive) of each quality band, highest first. Scores below
# the last bound are rated "Poor".
_QUALITY_THRESHOLDS = (
    (4.5, "Excellent"),
    (4.0, "Very Good"),
    (3.5, "Good"),
    (3.0, "Fair"),
)


def _score_to_quality(score: float) -> str:
    """Map a score on the 1-5 scale to its quality label."""
    for minimum, label in _QUALITY_THRESHOLDS:
        if score >= minimum:
            return label
    return "Poor"


def evaluate_translation(
    source_text: str,
    translation_text: str,
    source_lang: Optional[str] = None,
    target_lang: Optional[str] = None,
) -> Dict[str, Any]:
    """Evaluate translation quality using BLASER and COMET.

    Args:
        source_text: Source text.
        translation_text: Translation to evaluate.
        source_lang: Language code of the source text, forwarded to the BLASER
            evaluator. When omitted, the BLASER evaluator's own default is used.
        target_lang: Language code of the translation, forwarded likewise.

    Returns:
        On success: ``success=True`` plus ``blaser_score``, ``comet_score``
        (COMET mapped to the 1-5 range), ``raw_comet_score`` (all rounded to
        three decimals) and ``quality_assessment``.
        On failure: ``success=False`` and ``error`` with the exception text.
    """
    try:
        print("📊 Evaluating translation...")

        # Only forward the languages that were actually supplied, so callers
        # that pass none keep getting the BLASER evaluator's defaults.
        lang_kwargs: Dict[str, str] = {}
        if source_lang:
            lang_kwargs["source_lang"] = source_lang
        if target_lang:
            lang_kwargs["target_lang"] = target_lang

        # Get BLASER score
        blaser_score = blaser_evaluator.evaluate(
            source_text,
            translation_text,
            **lang_kwargs,
        )

        # Get COMET score
        comet_result = comet_evaluator.evaluate(
            source_text,
            translation_text,
            "",  # Reference translation is not available in this context
        )

        if comet_result is None:
            raise Exception("COMET evaluation failed")

        # Extract scores
        comet_score = comet_result["comet_score"]
        comet_mapped_score = comet_result["mapped_score"]

        # Get quality assessment
        quality = get_quality_assessment(
            blaser_score=blaser_score,
            comet_score=comet_mapped_score,
        )

        return {
            "success": True,
            "blaser_score": round(blaser_score, 3),
            "comet_score": round(comet_mapped_score, 3),
            "raw_comet_score": round(comet_score, 3),
            "quality_assessment": quality,
        }

    except Exception as e:
        print(f"❌ Translation evaluation failed: {e}")
        return {
            "success": False,
            "error": str(e),
        }


def get_quality_assessment(blaser_score: float, comet_score: Optional[float] = None) -> Dict[str, Any]:
    """Get quality assessment based on BLASER and COMET scores.

    Args:
        blaser_score: BLASER score on the 1-5 scale.
        comet_score: COMET score already mapped to the 1-5 scale, or None when
            no COMET score is available.

    Returns:
        Dict with ``quality_level`` (from BLASER), ``comet_quality_level``
        (None without a COMET score), ``confidence`` and ``explanation``.
    """
    # Both metrics share the same band thresholds.
    blaser_quality = _score_to_quality(blaser_score)
    comet_quality = _score_to_quality(comet_score) if comet_score is not None else None

    # Determine confidence level based on multiple factors.
    # With no COMET score there is nothing to compare against and the
    # confidence stays at its default.
    confidence = "High"
    if comet_score is not None:
        score_diff = abs(blaser_score - comet_score)
        avg_score = (blaser_score + comet_score) / 2

        # Very high confidence when scores agree and are in good ranges
        if score_diff < 0.2 and avg_score >= 4.0:
            confidence = "Very High"
        # High confidence when scores are similar and acceptable
        elif score_diff < 0.3 and avg_score >= 3.5:
            confidence = "High"
        # Medium confidence when scores differ moderately or are in mediocre range.
        # NOTE(review): because this is an `or`, "Low" is reached only when the
        # scores differ by >= 0.5 AND average below 3.0; widely diverging
        # scores with an average >= 3.0 are rated "Medium". Confirm intended.
        elif score_diff < 0.5 or avg_score >= 3.0:
            confidence = "Medium"
        # Low confidence otherwise
        else:
            confidence = "Low"

    return {
        "quality_level": blaser_quality,
        "comet_quality_level": comet_quality,
        "confidence": confidence,
        "explanation": get_quality_explanation(blaser_quality, comet_quality, confidence),
    }


def get_quality_explanation(blaser_quality: str, comet_quality: Optional[str], confidence: str) -> str:
    """Generate a one-sentence explanation for a quality assessment.

    Args:
        blaser_quality: Quality label derived from the BLASER score.
        comet_quality: Quality label derived from the COMET score, or None.
        confidence: Confidence label produced by ``get_quality_assessment``.
    """
    if not comet_quality:
        return f"BLASER rates this translation as {blaser_quality}."

    if blaser_quality == comet_quality:
        if confidence == "Very High":
            return f"Both BLASER and COMET strongly agree that this translation is {blaser_quality}."
        return f"Both BLASER and COMET agree that this translation is {blaser_quality}."

    if confidence == "Low":
        return f"There is significant disagreement between metrics: BLASER rates it as {blaser_quality} while COMET rates it as {comet_quality}."

    return f"BLASER rates this translation as {blaser_quality}, while COMET rates it as {comet_quality}. Consider reviewing for potential issues."


@app.route('/')
def index():
    """Render the main page"""
    return render_template('index.html')


@app.route('/evaluate', methods=['POST'])
def evaluate():
    """Handle translation evaluation requests.

    Expects the form fields ``source_text``, ``translation_text``,
    ``source_lang`` and ``target_lang``. Responds with 400 when any of them is
    missing or empty, 500 on an unexpected error, and otherwise with the JSON
    produced by ``evaluate_translation``.
    """
    try:
        # Get form data
        source_text: Optional[str] = request.form.get('source_text')
        translation_text: Optional[str] = request.form.get('translation_text')
        source_lang: Optional[str] = request.form.get('source_lang')
        target_lang: Optional[str] = request.form.get('target_lang')

        # Validate input
        if not all([source_text, translation_text, source_lang, target_lang]):
            return jsonify({
                'error': 'Missing required fields'
            }), 400

        # At this point, we know all values are strings (narrowing for type checkers)
        assert isinstance(source_text, str)
        assert isinstance(translation_text, str)
        assert isinstance(source_lang, str)
        assert isinstance(target_lang, str)

        # Evaluate translation using both metrics, scoring with the languages
        # the user selected rather than the BLASER evaluator's defaults.
        evaluation = evaluate_translation(
            source_text,
            translation_text,
            source_lang=source_lang,
            target_lang=target_lang,
        )

        return jsonify(evaluation)

    except Exception as e:
        print(f"Error during evaluation: {e}")
        return jsonify({
            'error': 'An error occurred during evaluation'
        }), 500


if __name__ == "__main__":
    # Run the app
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)
|
blaser_sonar_space.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BLASER 2.0-QE Implementation using sonar-space package
|
| 4 |
+
This implementation should give accurate scores matching the official results
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
|
| 9 |
+
from sonar.models.blaser.loader import load_blaser_model
|
| 10 |
+
|
| 11 |
+
class BLASEREvaluator:
    """BLASER 2.0-QE scorer built on a SONAR text embedder.

    Call :meth:`initialize` once before :meth:`evaluate`.
    """

    def __init__(self):
        """Initialize BLASER evaluator"""
        # All three are filled in by initialize().
        self.text_embedder = None
        self.blaser_model = None
        self.initialized = False

    def initialize(self) -> bool:
        """Initialize models and pipelines.

        Returns:
            True when both the text embedder and the BLASER model are loaded.
            False when loading raised; the error and its traceback are printed
            and the evaluator is reset to its uninitialized state.
        """
        import traceback  # local import: only needed on the failure path below

        print("🚀 Initializing BLASER 2.0-QE...")
        print("This may take a few minutes on first run as models are downloaded...")

        try:
            # Initialize text embedder with SONAR
            print("📝 Loading SONAR text embedder...")
            self.text_embedder = TextToEmbeddingModelPipeline(
                encoder="text_sonar_basic_encoder",
                tokenizer="text_sonar_basic_encoder",
            )

            # Load BLASER model
            print("🎯 Loading BLASER 2.0-QE model...")
            self.blaser_model = load_blaser_model("blaser_2_0_qe").eval()

            self.initialized = True
            print("✅ BLASER 2.0-QE initialized successfully!")
            return True

        except Exception as e:
            # Drop anything that was loaded before the failure so the object
            # never looks partly usable.
            self.text_embedder = None
            self.blaser_model = None
            self.initialized = False
            print(f"❌ Initialization failed: {e}")
            traceback.print_exc()  # Print full traceback
            print("Try setting FAIRSEQ2_EXTENSION_TRACE=1 for more details")
            return False

    def evaluate(self, source_text: str, translation_text: str,
                 source_lang: str = "fra_Latn", target_lang: str = "eng_Latn") -> float:
        """
        Evaluate translation quality using BLASER 2.0-QE

        Args:
            source_text: Source text
            translation_text: Machine translation
            source_lang: Source language code (default: fra_Latn)
            target_lang: Target language code (default: eng_Latn)

        Returns:
            BLASER score (higher is better)

        Raises:
            RuntimeError: If initialize() has not completed successfully.
        """
        if not self.initialized:
            raise RuntimeError("BLASER not initialized. Call initialize() first.")

        print(f"\n📊 Evaluating translation:")
        print(f" Source ({source_lang}): {source_text}")
        print(f" Translation ({target_lang}): {translation_text}")

        # Generate embeddings using SONAR. The translation is embedded with
        # target_lang passed as the embedder's source_lang, i.e. the language
        # the text being embedded is written in.
        print("🔄 Generating embeddings...")
        src_embs = self.text_embedder.predict([source_text], source_lang=source_lang)
        mt_embs = self.text_embedder.predict([translation_text], source_lang=target_lang)

        # Get BLASER score
        print("🔄 Computing BLASER score...")
        with torch.inference_mode():
            score = self.blaser_model(src=src_embs, mt=mt_embs).item()

        print(f"✨ BLASER score: {score:.3f}")
        return score
|
| 77 |
+
|
| 78 |
+
def main():
    """Example usage: score a fixed set of sentence pairs and print the results."""
    # Initialize evaluator; nothing can be scored if loading fails
    evaluator = BLASEREvaluator()
    ready = evaluator.initialize()
    if not ready:
        print("Failed to initialize BLASER")
        return

    # Test cases with both directions, one row per case:
    # (name, source, translation, source_lang, target_lang)
    rows = (
        # French-English pair
        ("French → English",
         "Le chat s'assit sur le tapis.", "The cat sat down on the carpet.",
         "fra_Latn", "eng_Latn"),
        ("English → French",
         "The cat sat down on the carpet.", "Le chat s'assit sur le tapis.",
         "eng_Latn", "fra_Latn"),
        # English-English pair
        ("English → English (present continuous → simple)",
         "The dog is running.", "The dog runs.",
         "eng_Latn", "eng_Latn"),
        ("English → English (simple → present continuous)",
         "The dog runs.", "The dog is running.",
         "eng_Latn", "eng_Latn"),
        # Spanish-English pair
        ("Spanish → English",
         "El gato está sentado en la alfombra.", "The cat is sitting on the carpet.",
         "spa_Latn", "eng_Latn"),
        ("English → Spanish",
         "The cat is sitting on the carpet.", "El gato está sentado en la alfombra.",
         "eng_Latn", "spa_Latn"),
    )

    print("\n=== Running BLASER evaluations in both directions ===\n")

    separator = " " + "="*50
    for name, source, translation, source_lang, target_lang in rows:
        print(f"\n🔄 Testing: {name}")
        score = evaluator.evaluate(source, translation, source_lang, target_lang)
        print(f"📈 Final score: {score:.3f}")
        print(separator)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask>=2.0.0
|
| 2 |
+
unbabel-comet>=2.0.0
|
| 3 |
+
torch>=2.0.0 # Required by COMET
|
| 4 |
+
transformers>=4.0.0 # Required by COMET
|
| 5 |
+
sonar-space # For BLASER evaluation
|
| 6 |
+
python-dotenv
|
| 7 |
+
huggingface-hub
|
| 8 |
+
pytorch-lightning
|
static/favicon 2.ico
ADDED
|
|
static/icon.ico
ADDED
|
|
Git LFS Details
|
templates/index.html
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Quality Lens</title>
|
| 7 |
+
<link rel="icon" type="image/x-icon" href="{{ url_for('static', filename='icon.ico') }}">
|
| 8 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 9 |
+
<style>
|
| 10 |
+
body {
|
| 11 |
+
background-color: #f8f9fa;
|
| 12 |
+
padding-top: 2rem;
|
| 13 |
+
}
|
| 14 |
+
.score-display {
|
| 15 |
+
font-size: 2.5rem;
|
| 16 |
+
font-weight: bold;
|
| 17 |
+
color: #0d6efd;
|
| 18 |
+
}
|
| 19 |
+
.loading {
|
| 20 |
+
display: none;
|
| 21 |
+
}
|
| 22 |
+
.card {
|
| 23 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
| 24 |
+
}
|
| 25 |
+
.language-select {
|
| 26 |
+
max-width: 200px;
|
| 27 |
+
}
|
| 28 |
+
#result {
|
| 29 |
+
transition: all 0.3s ease;
|
| 30 |
+
}
|
| 31 |
+
.score-circle {
|
| 32 |
+
width: 120px;
|
| 33 |
+
height: 120px;
|
| 34 |
+
border-radius: 50%;
|
| 35 |
+
border: 8px solid #e9ecef;
|
| 36 |
+
display: flex;
|
| 37 |
+
flex-direction: column;
|
| 38 |
+
justify-content: center;
|
| 39 |
+
align-items: center;
|
| 40 |
+
margin: 0 auto;
|
| 41 |
+
transition: border-color 0.3s ease;
|
| 42 |
+
}
|
| 43 |
+
.score-circle.excellent {
|
| 44 |
+
border-color: #28a745;
|
| 45 |
+
}
|
| 46 |
+
.score-circle.acceptable {
|
| 47 |
+
border-color: #17a2b8;
|
| 48 |
+
}
|
| 49 |
+
.score-circle.poor {
|
| 50 |
+
border-color: #dc3545;
|
| 51 |
+
}
|
| 52 |
+
.score-label {
|
| 53 |
+
font-size: 0.8rem;
|
| 54 |
+
color: #6c757d;
|
| 55 |
+
}
|
| 56 |
+
.quality-meter {
|
| 57 |
+
padding: 10px;
|
| 58 |
+
}
|
| 59 |
+
.hallucination-types .alert {
|
| 60 |
+
margin-bottom: 0.5rem;
|
| 61 |
+
}
|
| 62 |
+
.action-item {
|
| 63 |
+
padding: 0.5rem 0;
|
| 64 |
+
}
|
| 65 |
+
.quality-level, .confidence {
|
| 66 |
+
font-size: 1.1rem;
|
| 67 |
+
}
|
| 68 |
+
.explanation {
|
| 69 |
+
font-size: 1rem;
|
| 70 |
+
color: #6c757d;
|
| 71 |
+
}
|
| 72 |
+
</style>
|
| 73 |
+
</head>
|
| 74 |
+
<body>
|
| 75 |
+
<div class="container">
|
| 76 |
+
<h1 class="text-center mb-4">Quality Lens</h1>
|
| 77 |
+
<h5 class="text-center text-muted mb-4">Translation QE & Hallucination Detection</h5>
|
| 78 |
+
|
| 79 |
+
<div class="row justify-content-center">
|
| 80 |
+
<div class="col-md-10">
|
| 81 |
+
<div class="card">
|
| 82 |
+
<div class="card-body">
|
| 83 |
+
<form id="evaluationForm" method="POST">
|
| 84 |
+
<div class="row mb-3">
|
| 85 |
+
<!-- Source Text -->
|
| 86 |
+
<div class="col-md-6">
|
| 87 |
+
<div class="form-group">
|
| 88 |
+
<label for="source_text" class="form-label">Source Text:</label>
|
| 89 |
+
<textarea class="form-control" id="source_text" name="source_text" rows="4" required></textarea>
|
| 90 |
+
<div class="mt-2">
|
| 91 |
+
<label for="source_lang" class="form-label">Source Language:</label>
|
| 92 |
+
<select class="form-select language-select" id="source_lang" name="source_lang" required>
|
| 93 |
+
<option value="eng_Latn">English</option>
|
| 94 |
+
<option value="fra_Latn">French</option>
|
| 95 |
+
<option value="spa_Latn">Spanish</option>
|
| 96 |
+
<option value="deu_Latn">German</option>
|
| 97 |
+
<option value="ita_Latn">Italian</option>
|
| 98 |
+
<option value="por_Latn">Portuguese</option>
|
| 99 |
+
<option value="nld_Latn">Dutch</option>
|
| 100 |
+
<option value="zho_Hans">Chinese (Simplified)</option>
|
| 101 |
+
<option value="jpn_Jpan">Japanese</option>
|
| 102 |
+
<option value="kor_Hang">Korean</option>
|
| 103 |
+
</select>
|
| 104 |
+
</div>
|
| 105 |
+
</div>
|
| 106 |
+
</div>
|
| 107 |
+
|
| 108 |
+
<!-- Translation Text -->
|
| 109 |
+
<div class="col-md-6">
|
| 110 |
+
<div class="form-group">
|
| 111 |
+
<label for="translation_text" class="form-label">Translation:</label>
|
| 112 |
+
<textarea class="form-control" id="translation_text" name="translation_text" rows="4" required></textarea>
|
| 113 |
+
<div class="mt-2">
|
| 114 |
+
<label for="target_lang" class="form-label">Target Language:</label>
|
| 115 |
+
<select class="form-select language-select" id="target_lang" name="target_lang" required>
|
| 116 |
+
<option value="eng_Latn">English</option>
|
| 117 |
+
<option value="fra_Latn">French</option>
|
| 118 |
+
<option value="spa_Latn">Spanish</option>
|
| 119 |
+
<option value="deu_Latn">German</option>
|
| 120 |
+
<option value="ita_Latn">Italian</option>
|
| 121 |
+
<option value="por_Latn">Portuguese</option>
|
| 122 |
+
<option value="nld_Latn">Dutch</option>
|
| 123 |
+
<option value="zho_Hans">Chinese (Simplified)</option>
|
| 124 |
+
<option value="jpn_Jpan">Japanese</option>
|
| 125 |
+
<option value="kor_Hang">Korean</option>
|
| 126 |
+
</select>
|
| 127 |
+
</div>
|
| 128 |
+
</div>
|
| 129 |
+
</div>
|
| 130 |
+
</div>
|
| 131 |
+
|
| 132 |
+
<!-- Submit Button -->
|
| 133 |
+
<div class="text-center">
|
| 134 |
+
<button type="submit" class="btn btn-primary btn-lg px-4">
|
| 135 |
+
Evaluate Translation
|
| 136 |
+
</button>
|
| 137 |
+
</div>
|
| 138 |
+
</form>
|
| 139 |
+
|
| 140 |
+
<!-- Loading Spinner -->
|
| 141 |
+
<div class="loading text-center mt-4">
|
| 142 |
+
<div class="spinner-border text-primary" role="status">
|
| 143 |
+
<span class="visually-hidden">Loading...</span>
|
| 144 |
+
</div>
|
| 145 |
+
<p class="mt-2">Evaluating translation quality...</p>
|
| 146 |
+
</div>
|
| 147 |
+
|
| 148 |
+
<!-- Results -->
|
| 149 |
+
<div id="result" class="mt-4" style="display: none;">
|
| 150 |
+
<div class="row">
|
| 151 |
+
<!-- BLASER Score -->
|
| 152 |
+
<div class="col-md-6 mb-3">
|
| 153 |
+
<div class="card h-100">
|
| 154 |
+
<div class="card-body text-center">
|
| 155 |
+
<h5 class="card-title mb-3">BLASER Score</h5>
|
| 156 |
+
<div class="score-display mb-2" id="blaser_score">0.000</div>
|
| 157 |
+
<p class="text-muted">
|
| 158 |
+
BLASER scores range from 1 to 5, where 5 indicates perfect semantic equivalence.
|
| 159 |
+
</p>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
</div>
|
| 163 |
+
|
| 164 |
+
<!-- COMET Score -->
|
| 165 |
+
<div class="col-md-6 mb-3">
|
| 166 |
+
<div class="card h-100">
|
| 167 |
+
<div class="card-body text-center">
|
| 168 |
+
<h5 class="card-title mb-3">COMET Score</h5>
|
| 169 |
+
<div class="score-display mb-2" id="comet_score">0.000</div>
|
| 170 |
+
<p class="text-muted">
|
| 171 |
+
COMET scores are mapped to match BLASER's 1-5 range.
|
| 172 |
+
</p>
|
| 173 |
+
</div>
|
| 174 |
+
</div>
|
| 175 |
+
</div>
|
| 176 |
+
</div>
|
| 177 |
+
|
| 178 |
+
<!-- Hallucination Detection -->
|
| 179 |
+
<div class="card mt-3">
|
| 180 |
+
<div class="card-body">
|
| 181 |
+
<h5 class="card-title">
|
| 182 |
+
<i class="fas fa-exclamation-triangle"></i>
|
| 183 |
+
Hallucination Detection
|
| 184 |
+
</h5>
|
| 185 |
+
<div class="progress mb-3">
|
| 186 |
+
<div id="hallucination_score_bar" class="progress-bar" role="progressbar" style="width: 0%">
|
| 187 |
+
0%
|
| 188 |
+
</div>
|
| 189 |
+
</div>
|
| 190 |
+
<p id="hallucination_details" class="mb-0">
|
| 191 |
+
<!-- Will be populated by JS -->
|
| 192 |
+
</p>
|
| 193 |
+
</div>
|
| 194 |
+
</div>
|
| 195 |
+
</div>
|
| 196 |
+
</div>
|
| 197 |
+
</div>
|
| 198 |
+
</div>
|
| 199 |
+
</div>
|
| 200 |
+
|
| 201 |
+
<!-- Acknowledgments -->
|
| 202 |
+
<footer class="mt-5 mb-4">
|
| 203 |
+
<div class="container">
|
| 204 |
+
<hr>
|
| 205 |
+
<div class="row justify-content-center">
|
| 206 |
+
<div class="col-md-8">
|
| 207 |
+
<h6 class="text-center mb-3">Acknowledgments</h6>
|
| 208 |
+
<div class="d-flex flex-wrap justify-content-center gap-4">
|
| 209 |
+
<div class="text-center">
|
| 210 |
+
<a href="https://huggingface.co/facebook/blaser-2.0-qe" target="_blank" class="text-decoration-none">
|
| 211 |
+
<span class="badge bg-light text-dark border">BLASER 2.0</span>
|
| 212 |
+
</a>
|
| 213 |
+
<div class="small text-muted">CC BY-NC 4.0</div>
|
| 214 |
+
</div>
|
| 215 |
+
<div class="text-center">
|
| 216 |
+
<a href="https://github.com/facebookresearch/SONAR" target="_blank" class="text-decoration-none">
|
| 217 |
+
<span class="badge bg-light text-dark border">SONAR</span>
|
| 218 |
+
</a>
|
| 219 |
+
<div class="small text-muted">MIT</div>
|
| 220 |
+
</div>
|
| 221 |
+
<div class="text-center">
|
| 222 |
+
<a href="https://github.com/Unbabel/COMET" target="_blank" class="text-decoration-none">
|
| 223 |
+
<span class="badge bg-light text-dark border">COMET-QE</span>
|
| 224 |
+
</a>
|
| 225 |
+
<div class="small text-muted">CC BY-NC-SA 4.0</div>
|
| 226 |
+
</div>
|
| 227 |
+
<div class="text-center">
|
| 228 |
+
<a href="https://arxiv.org/abs/2501.17295" target="_blank" class="text-decoration-none">
|
| 229 |
+
<span class="badge bg-light text-dark border">Hallucination Detection</span>
|
| 230 |
+
</a>
|
| 231 |
+
<div class="small text-muted">arXiv:2501.17295</div>
|
| 232 |
+
</div>
|
| 233 |
+
</div>
|
| 234 |
+
<p class="small text-muted text-center mt-3">
|
| 235 |
+
For research and non-commercial use only
|
| 236 |
+
</p>
|
| 237 |
+
</div>
|
| 238 |
+
</div>
|
| 239 |
+
</div>
|
| 240 |
+
</footer>
|
| 241 |
+
</div>
|
| 242 |
+
|
| 243 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5/dist/js/bootstrap.bundle.min.js"></script>
|
| 244 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
|
| 245 |
+
<script>
|
| 246 |
+
/**
 * Render an evaluation result on the page.
 *
 * Writes the BLASER and COMET scores into their elements, reveals the
 * result panel, and updates the hallucination progress bar and its
 * explanatory text.
 *
 * @param {Object} data - Parsed JSON payload from the backend.
 * @param {boolean} data.success - False when the backend reported a failure.
 * @param {string} [data.error] - Failure description used when success is false.
 * @param {number} data.blaser_score - BLASER score; divided by 5 below, so it
 *     is treated as being on a 0-5 scale.
 * @param {number} data.comet_score - COMET score, displayed as-is.
 * @throws {Error} If the backend reported a failure or a score is not a
 *     finite number.
 */
function updateAssessment(data) {
    if (!data.success) {
        throw new Error(data.error || 'Unknown error occurred');
    }

    // Validate before touching the DOM so a malformed payload cannot leave
    // the page half-updated (one score refreshed, the other stale).
    const blaserScore = data.blaser_score;
    const cometScore = data.comet_score;
    if (!Number.isFinite(blaserScore) || !Number.isFinite(cometScore)) {
        throw new Error('Evaluation response is missing a numeric score');
    }

    // Update scores
    document.getElementById('blaser_score').textContent = blaserScore.toFixed(3);
    document.getElementById('comet_score').textContent = cometScore.toFixed(3);

    // Show results
    document.getElementById('result').style.display = 'block';

    // Hallucination score: HS(x,y) = 1 - BLASER(x,y)/5.
    // Clamp to [0, 1]: a score outside 0-5 would otherwise yield a negative
    // or >100% width, which is invalid CSS and leaves the bar showing the
    // width of the previous evaluation.
    const hallucinationScore = Math.min(1, Math.max(0, 1 - (blaserScore / 5)));
    const riskPercentage = Math.round(hallucinationScore * 100);

    // Update progress bar
    const riskBar = document.getElementById('hallucination_score_bar');
    riskBar.style.width = `${riskPercentage}%`;
    riskBar.textContent = `${riskPercentage}%`;

    // Threshold T=0.4 corresponds to a BLASER score of 3.0 (1 - 3/5);
    // HS >= 0.4 is reported as a likely hallucination.
    const details = document.getElementById('hallucination_details');
    if (hallucinationScore >= 0.4) {
        riskBar.className = 'progress-bar bg-danger';
        details.textContent =
            'High likelihood of hallucination detected (HS ≥ 0.4). The translation may contain fabricated content.';
    } else if (hallucinationScore >= 0.3) {
        riskBar.className = 'progress-bar bg-warning';
        details.textContent =
            'Moderate risk of semantic divergence (0.3 ≤ HS < 0.4).';
    } else {
        riskBar.className = 'progress-bar bg-info';
        details.textContent =
            'Low risk of hallucination (HS < 0.3). Translation appears semantically faithful.';
    }
}
|
| 284 |
+
|
| 285 |
+
// Submit the evaluation form with fetch and render the response in place.
// The spinner is shown for the duration of the request and always hidden
// again in `finally`, whether the evaluation succeeded or failed.
document.getElementById('evaluationForm').addEventListener('submit', async function(e) {
    // Suppress the browser's default form submission; the request is sent below.
    e.preventDefault();

    // Show loading spinner and hide any previous result
    const spinner = document.querySelector('.loading');
    spinner.style.display = 'block';
    document.getElementById('result').style.display = 'none';

    // Get form data (`this` is the submitted form element)
    const formData = new FormData(this);

    try {
        // Send request to backend
        const response = await fetch('/evaluate', {
            method: 'POST',
            body: formData
        });

        // A failing server may answer with a non-JSON body; report the HTTP
        // status instead of an opaque JSON parse error.
        let data;
        try {
            data = await response.json();
        } catch (parseError) {
            throw new Error(`Server returned an unreadable response (HTTP ${response.status})`);
        }

        // `data` can be null if the body was the JSON literal `null`.
        if (!data || !data.success) {
            throw new Error((data && data.error) || 'Unknown error occurred');
        }

        // Update assessment display
        updateAssessment(data);
    } catch (error) {
        console.error('Error:', error);
        // Keep the original guidance but include the reason, so the
        // backend's error message is not silently discarded.
        alert('An error occurred while evaluating the translation. Please try again.' +
            `\n\nDetails: ${error.message}`);
    } finally {
        // Hide loading spinner
        spinner.style.display = 'none';
    }
});
|
| 318 |
+
</script>
|
| 319 |
+
</body>
|
| 320 |
+
</html>
|