# Hugging Face Spaces page header captured with this file (Space status: Sleeping).
"""
Advanced NovaEval Space by Noveum.ai
Comprehensive AI Model Evaluation Platform with Hugging Face Models
"""
import asyncio
import json
import logging
import os
import sys
import time
import traceback
import uuid
import zlib
from datetime import datetime
from typing import Any, Dict, List, Optional

import httpx
import uvicorn
from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
# Log to stdout only: writing a log file can fail on hosts where the working
# directory is not writable.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
# The ASGI application object.
app = FastAPI(
    title="NovaEval by Noveum.ai",
    description="Advanced AI Model Evaluation Platform with Hugging Face Models",
    version="2.0.0",
)

# CORS: the API is open to every origin. Credentialed requests are turned off
# because a wildcard origin must not be combined with allow_credentials=True
# (FastAPI's CORS documentation requires explicit origins in that case, and
# the combination would let any website issue cookie-bearing requests).
# If cookies or auth headers are ever needed cross-origin, replace the
# wildcard with an explicit origin list before re-enabling credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Pydantic Models
class EvaluationRequest(BaseModel):
    """Request payload describing one evaluation run.

    The browser UI embedded in this file builds this payload in its
    ``startEvaluation()`` function and POSTs it to ``/api/evaluate``.
    """

    # Hugging Face model identifiers to evaluate, e.g. "google/flan-t5-large".
    models: List[str]
    # Identifier of the dataset the models are evaluated on.
    dataset: str
    # Metric identifiers. simulate_evaluation produces scores for "accuracy",
    # "f1_score", "bleu", "rouge" and "pass_at_k"; other values are skipped.
    metrics: List[str]
    # Number of samples processed per model.
    sample_size: int = 50
    # Generation settings sent by the UI. The simulated evaluation visible in
    # this file does not read any of the three fields below.
    temperature: float = 0.7
    max_tokens: int = 512
    top_p: float = 0.9
class EvaluationResponse(BaseModel):
    """Acknowledgement describing whether an evaluation run was started.

    NOTE(review): the endpoint that returns this model is outside the visible
    part of the file; the embedded UI reads exactly these three keys from the
    ``/api/evaluate`` response.
    """

    # Identifier of the run; the UI uses it to open ``/ws/{evaluation_id}``.
    evaluation_id: str
    # The UI treats the value "started" as success and anything else as failure.
    status: str
    # Human-readable detail; the UI shows it when the run could not be started.
    message: str
# Global state
# Per-run state keyed by evaluation id. simulate_evaluation stores a dict with
# the keys "status", "progress", "current_step", "results", "logs" and
# "start_time" (plus "end_time" or "error" once the run finishes).
active_evaluations: Dict[str, Dict[str, Any]] = {}
# Open connections keyed by evaluation id. send_websocket_message calls
# ``send_text`` on the stored value; presumably these are FastAPI WebSocket
# objects registered by a handler outside the visible code — TODO confirm.
websocket_connections: Dict[str, Any] = {}
# Hugging Face Models Configuration
def _hf_model(model_id, name, size, description, capabilities, provider,
              cost_per_1k=0.0):
    """Build one model catalogue entry with the key order the UI receives."""
    return {
        "id": model_id,
        "name": name,
        "size": size,
        "description": description,
        "capabilities": capabilities,
        "cost_per_1k": cost_per_1k,
        "provider": provider,
    }


# Model catalogue grouped by size tier; serialised to JSON for the web UI.
HF_MODELS = {
    "small": [
        _hf_model(
            "google/flan-t5-large", "FLAN-T5 Large", "0.8B",
            "Best pretrained model around 1B parameters",
            ["text-generation", "reasoning", "qa"], "Google",
        ),
        _hf_model(
            "Qwen/Qwen2.5-3B", "Qwen 2.5 3B", "3B",
            "Best pretrained model around 3B parameters",
            ["text-generation", "reasoning", "multilingual"], "Alibaba",
        ),
        _hf_model(
            "google/gemma-2b", "Gemma 2B", "2B",
            "Efficient small model for general tasks",
            ["text-generation", "reasoning"], "Google",
        ),
    ],
    "medium": [
        _hf_model(
            "Qwen/Qwen2.5-7B", "Qwen 2.5 7B", "7B",
            "Best pretrained model around 7B parameters",
            ["text-generation", "reasoning", "analysis"], "Alibaba",
        ),
        _hf_model(
            "mistralai/Mistral-7B-v0.1", "Mistral 7B", "7B",
            "Strong general purpose model",
            ["text-generation", "reasoning", "analysis"], "Mistral AI",
        ),
        _hf_model(
            "microsoft/DialoGPT-medium", "DialoGPT Medium", "345M",
            "Conversational AI specialist",
            ["conversation", "dialogue"], "Microsoft",
        ),
        _hf_model(
            "codellama/CodeLlama-7b-Python-hf", "CodeLlama 7B Python", "7B",
            "Code generation specialist",
            ["code-generation", "python"], "Meta",
        ),
    ],
    "large": [
        _hf_model(
            "Qwen/Qwen2.5-14B", "Qwen 2.5 14B", "14B",
            "Best pretrained model around 14B parameters",
            ["text-generation", "reasoning", "analysis", "complex-tasks"],
            "Alibaba",
        ),
        _hf_model(
            "Qwen/Qwen2.5-32B", "Qwen 2.5 32B", "32B",
            "Best pretrained model around 32B parameters",
            ["text-generation", "reasoning", "analysis", "complex-tasks"],
            "Alibaba",
        ),
        _hf_model(
            "Qwen/Qwen2.5-72B", "Qwen 2.5 72B", "72B",
            "Best pretrained model around 72B parameters",
            ["text-generation", "reasoning", "analysis", "complex-tasks"],
            "Alibaba",
        ),
    ],
}
# Evaluation Datasets Configuration
# Column order of every row below; it is also the key order of each entry.
_DATASET_FIELDS = ("id", "name", "description", "samples", "task_type", "difficulty")

# One tuple per dataset, grouped by category.
_DATASET_ROWS = {
    "reasoning": [
        ("Rowan/hellaswag", "HellaSwag",
         "Commonsense reasoning benchmark", 60000, "multiple_choice", "medium"),
        ("tau/commonsense_qa", "CommonsenseQA",
         "Commonsense reasoning questions", 12100, "multiple_choice", "medium"),
        ("allenai/ai2_arc", "ARC (AI2 Reasoning Challenge)",
         "Science questions requiring reasoning", 7790, "multiple_choice", "hard"),
    ],
    "knowledge": [
        ("cais/mmlu", "MMLU",
         "Massive Multitask Language Understanding", 231000, "multiple_choice", "hard"),
        ("google/boolq", "BoolQ",
         "Boolean questions requiring reading comprehension", 12700, "yes_no", "medium"),
    ],
    "math": [
        ("openai/gsm8k", "GSM8K",
         "Grade school math word problems", 17600, "generation", "medium"),
        ("deepmind/aqua_rat", "AQUA-RAT",
         "Algebraic reasoning problems", 196000, "multiple_choice", "hard"),
    ],
    "code": [
        ("openai/openai_humaneval", "HumanEval",
         "Python code generation benchmark", 164, "code_generation", "hard"),
        ("google-research-datasets/mbpp", "MBPP",
         "Mostly Basic Python Problems", 1400, "code_generation", "medium"),
    ],
    "language": [
        ("stanfordnlp/imdb", "IMDB Reviews",
         "Movie review sentiment analysis", 100000, "classification", "easy"),
        ("abisee/cnn_dailymail", "CNN/DailyMail",
         "News article summarization", 936000, "summarization", "medium"),
    ],
}

# Dataset catalogue grouped by category; serialised to JSON for the web UI.
EVALUATION_DATASETS = {
    category: [dict(zip(_DATASET_FIELDS, row)) for row in rows]
    for category, rows in _DATASET_ROWS.items()
}
# Evaluation Metrics
# Each entry lists the dataset task types the metric applies to.
EVALUATION_METRICS = [
    dict(
        id="accuracy",
        name="Accuracy",
        description="Percentage of correct predictions",
        applicable_tasks=["multiple_choice", "yes_no", "classification"],
    ),
    dict(
        id="f1_score",
        name="F1 Score",
        description="Harmonic mean of precision and recall",
        applicable_tasks=["classification", "multiple_choice"],
    ),
    dict(
        id="bleu",
        name="BLEU Score",
        description="Bilingual Evaluation Understudy for text generation",
        applicable_tasks=["generation", "summarization", "code_generation"],
    ),
    dict(
        id="rouge",
        name="ROUGE Score",
        description="Recall-Oriented Understudy for Gisting Evaluation",
        applicable_tasks=["summarization", "generation"],
    ),
    dict(
        id="pass_at_k",
        name="Pass@K",
        description="Percentage of problems solved correctly",
        applicable_tasks=["code_generation"],
    ),
]
async def send_websocket_message(evaluation_id: str, message: dict):
    """Push *message* as JSON text to the socket registered for *evaluation_id*.

    Does nothing when no connection is registered under that id. Any failure
    while serialising or sending is logged and swallowed, so a broken client
    connection never interrupts the caller.
    """
    try:
        connection = websocket_connections[evaluation_id]
    except KeyError:
        # Nobody is listening for this evaluation.
        return

    try:
        payload = json.dumps(message)
        await connection.send_text(payload)
    except Exception as e:
        logger.error(f"Failed to send WebSocket message: {e}")
# (base score, modulus) per supported metric. A simulated score is
# base + (seed % modulus) / 100, so it falls in [base, base + modulus / 100).
_SIMULATED_SCORE_RANGES = {
    "accuracy": (0.65, 30),
    "f1_score": (0.60, 35),
    "bleu": (0.25, 40),
    "rouge": (0.30, 35),
    "pass_at_k": (0.15, 50),
}


def _simulated_scores(model_id: str, metrics: List[str]) -> Dict[str, float]:
    """Return a placeholder score for every recognised metric in *metrics*.

    The seed is a CRC-32 of the model id rather than the built-in ``hash()``:
    string hashes are salted per interpreter process, which made the same
    model report different scores after every restart.
    """
    seed = zlib.crc32(model_id.encode("utf-8"))
    scores: Dict[str, float] = {}
    for metric in metrics:
        score_range = _SIMULATED_SCORE_RANGES.get(metric)
        if score_range is None:
            continue  # unknown metric ids are skipped, not reported
        base, modulus = score_range
        scores[metric] = round(base + (seed % modulus) / 100, 3)
    return scores


async def _emit_log(evaluation_id: str, level: str, message: str) -> None:
    """Record a log entry in the stored run state and push it to the client."""
    entry = {
        "type": "log",
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
    }
    state = active_evaluations.get(evaluation_id)
    if state is not None:
        state.setdefault("logs", []).append(entry)
    await send_websocket_message(evaluation_id, entry)


async def _emit_progress(evaluation_id: str, completed_steps: int,
                         total_steps: int, description: str) -> None:
    """Store the current progress in the run state and push it to the client."""
    percent = (completed_steps / total_steps) * 100
    state = active_evaluations.get(evaluation_id)
    if state is not None:
        # Keep the stored state in sync with what is sent over the socket so
        # that anything reading active_evaluations sees live progress.
        state["progress"] = percent
        state["current_step"] = description
    await send_websocket_message(evaluation_id, {
        "type": "progress",
        "progress": percent,
        "current_step": description,
    })


async def simulate_evaluation(evaluation_id: str, request: EvaluationRequest):
    """Simulate an evaluation run, streaming progress and logs as it goes.

    For each model in ``request.models`` the run walks through five steps
    (load model, prepare dataset, evaluate, calculate metrics, finalise),
    sleeping to imitate work and emitting "progress" and "log" messages via
    ``send_websocket_message``. Scores are placeholders derived from the model
    id; no model is actually loaded or executed.

    Args:
        evaluation_id: Key under which the run is tracked in
            ``active_evaluations`` and addressed on the WebSocket.
        request: Run parameters; ``models``, ``dataset``, ``metrics`` and
            ``sample_size`` are read.

    Returns:
        None. The outcome is stored in ``active_evaluations[evaluation_id]``:
        "status" becomes "completed" (with "results" and "end_time") or
        "failed" (with "error"). Exceptions are caught, logged and reported to
        the client as an "error" message rather than propagated.
    """
    try:
        # Initialize evaluation
        active_evaluations[evaluation_id] = {
            "status": "running",
            "progress": 0,
            "current_step": "Initializing",
            "results": {},
            "logs": [],
            "start_time": datetime.now(),
        }
        state = active_evaluations[evaluation_id]

        total_steps = len(request.models) * 5  # 5 steps per model
        current_step = 0
        metric_names = ', '.join(request.metrics)

        await _emit_log(
            evaluation_id, "INFO",
            f"🚀 Starting NovaEval evaluation with {len(request.models)} models",
        )
        await _emit_log(
            evaluation_id, "INFO",
            f"📊 Dataset: {request.dataset} | Sample size: {request.sample_size}",
        )
        await _emit_log(evaluation_id, "INFO", f"📏 Metrics: {metric_names}")

        # Process each model
        for model_id in request.models:
            model_name = model_id.split('/')[-1]

            # Step 1: Load model
            current_step += 1
            await _emit_progress(evaluation_id, current_step, total_steps,
                                 f"Loading {model_name}")
            await _emit_log(evaluation_id, "INFO", f"🤖 Loading model: {model_id}")
            await asyncio.sleep(2)  # Simulate model loading time

            # Step 2: Prepare dataset
            current_step += 1
            await _emit_progress(evaluation_id, current_step, total_steps,
                                 f"Preparing dataset for {model_name}")
            await _emit_log(evaluation_id, "INFO",
                            f"📥 Loading dataset: {request.dataset}")
            await asyncio.sleep(1)

            # Step 3: Run evaluation
            current_step += 1
            await _emit_progress(evaluation_id, current_step, total_steps,
                                 f"Evaluating {model_name}")
            await _emit_log(
                evaluation_id, "INFO",
                f"🧪 Running evaluation on {request.sample_size} samples",
            )
            # Simulate processing samples in batches of ten
            for batch_start in range(0, request.sample_size, 10):
                await asyncio.sleep(0.5)
                processed = min(batch_start + 10, request.sample_size)
                await _emit_log(
                    evaluation_id, "DEBUG",
                    f"📝 Processed {processed}/{request.sample_size} samples",
                )

            # Step 4: Calculate metrics
            current_step += 1
            await _emit_progress(evaluation_id, current_step, total_steps,
                                 f"Calculating metrics for {model_name}")
            await _emit_log(evaluation_id, "INFO",
                            f"📊 Calculating metrics: {metric_names}")
            await asyncio.sleep(1)

            # Step 5: Generate results
            current_step += 1
            await _emit_progress(evaluation_id, current_step, total_steps,
                                 f"Finalizing results for {model_name}")
            results = _simulated_scores(model_id, request.metrics)
            state["results"][model_id] = results
            await _emit_log(
                evaluation_id, "SUCCESS",
                f"✅ {model_name} evaluation complete: {results}",
            )
            await asyncio.sleep(1)

        # Finalize evaluation
        state["status"] = "completed"
        state["progress"] = 100
        state["current_step"] = "Completed"
        state["end_time"] = datetime.now()
        await send_websocket_message(evaluation_id, {
            "type": "complete",
            "results": state["results"],
            "message": "🎉 Evaluation completed successfully!",
        })
        await _emit_log(evaluation_id, "SUCCESS",
                        "🎯 All evaluations completed successfully!")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Evaluation failed: %s", e)
        # setdefault: the state entry may be missing if it was removed while
        # the run was in flight; recording the failure must not raise again.
        failed_state = active_evaluations.setdefault(evaluation_id, {})
        failed_state["status"] = "failed"
        failed_state["error"] = str(e)
        await send_websocket_message(evaluation_id, {
            "type": "error",
            "message": f"❌ Evaluation failed: {str(e)}",
        })
# API Endpoints | |
async def get_homepage(): | |
"""Serve the main application interface""" | |
return """ | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>NovaEval by Noveum.ai - Advanced AI Model Evaluation</title> | |
<script src="https://cdn.tailwindcss.com"></script> | |
<script src="https://unpkg.com/lucide@latest/dist/umd/lucide.js"></script> | |
<style> | |
.gradient-bg { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
} | |
.card-hover { | |
transition: all 0.3s ease; | |
} | |
.card-hover:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 10px 25px rgba(0,0,0,0.1); | |
} | |
.model-card { | |
border: 2px solid transparent; | |
transition: all 0.3s ease; | |
} | |
.model-card.selected { | |
border-color: #667eea; | |
background: rgba(102, 126, 234, 0.1); | |
} | |
.progress-bar { | |
transition: width 0.5s ease; | |
} | |
.log-entry { | |
animation: slideIn 0.3s ease; | |
} | |
@keyframes slideIn { | |
from { opacity: 0; transform: translateX(-10px); } | |
to { opacity: 1; transform: translateX(0); } | |
} | |
.metric-badge { | |
background: linear-gradient(45deg, #667eea, #764ba2); | |
} | |
</style> | |
</head> | |
<body class="bg-gray-50 min-h-screen"> | |
<!-- Header --> | |
<header class="gradient-bg text-white py-6 shadow-lg"> | |
<div class="container mx-auto px-4"> | |
<div class="flex items-center justify-between"> | |
<div class="flex items-center space-x-3"> | |
<div class="w-10 h-10 bg-white rounded-lg flex items-center justify-center"> | |
<i data-lucide="zap" class="w-6 h-6 text-purple-600"></i> | |
</div> | |
<div> | |
<h1 class="text-2xl font-bold">NovaEval</h1> | |
<p class="text-purple-100 text-sm">by <a href="https://noveum.ai" target="_blank" class="underline hover:text-white">Noveum.ai</a></p> | |
</div> | |
</div> | |
<div class="text-right"> | |
<p class="text-purple-100 text-sm">Advanced AI Model Evaluation Platform</p> | |
<p class="text-purple-200 text-xs">Powered by Hugging Face Models</p> | |
</div> | |
</div> | |
</div> | |
</header> | |
<div class="container mx-auto px-4 py-8"> | |
<!-- Main Content --> | |
<div class="grid grid-cols-1 lg:grid-cols-3 gap-8"> | |
<!-- Left Panel - Configuration --> | |
<div class="lg:col-span-2 space-y-6"> | |
<!-- Model Selection --> | |
<div class="bg-white rounded-xl shadow-lg p-6 card-hover"> | |
<div class="flex items-center space-x-3 mb-6"> | |
<i data-lucide="cpu" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Select Models</h2> | |
</div> | |
<!-- Model Search --> | |
<div class="mb-4"> | |
<div class="relative"> | |
<input type="text" id="modelSearch" placeholder="Search models..." | |
class="w-full pl-10 pr-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-purple-500 focus:border-transparent"> | |
<i data-lucide="search" class="w-5 h-5 text-gray-400 absolute left-3 top-2.5"></i> | |
</div> | |
</div> | |
<!-- Model Categories --> | |
<div class="mb-4"> | |
<div class="flex space-x-2"> | |
<button onclick="filterModels('all')" class="px-4 py-2 bg-purple-600 text-white rounded-lg text-sm hover:bg-purple-700 transition-colors" id="filter-all">All</button> | |
<button onclick="filterModels('small')" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300 transition-colors" id="filter-small">Small (1-3B)</button> | |
<button onclick="filterModels('medium')" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300 transition-colors" id="filter-medium">Medium (7B)</button> | |
<button onclick="filterModels('large')" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300 transition-colors" id="filter-large">Large (14B+)</button> | |
</div> | |
</div> | |
<!-- Model Grid --> | |
<div id="modelGrid" class="grid grid-cols-1 md:grid-cols-2 gap-4 max-h-96 overflow-y-auto"> | |
<!-- Models will be populated by JavaScript --> | |
</div> | |
<div class="mt-4 text-sm text-gray-600"> | |
<span id="selectedModelsCount">0</span> models selected | |
</div> | |
</div> | |
<!-- Dataset Selection --> | |
<div class="bg-white rounded-xl shadow-lg p-6 card-hover"> | |
<div class="flex items-center space-x-3 mb-6"> | |
<i data-lucide="database" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Select Dataset</h2> | |
</div> | |
<!-- Dataset Categories --> | |
<div class="mb-4"> | |
<div class="flex flex-wrap gap-2"> | |
<button onclick="filterDatasets('all')" class="px-3 py-1 bg-purple-600 text-white rounded-full text-sm hover:bg-purple-700 transition-colors" id="dataset-filter-all">All</button> | |
<button onclick="filterDatasets('reasoning')" class="px-3 py-1 bg-gray-200 text-gray-700 rounded-full text-sm hover:bg-gray-300 transition-colors" id="dataset-filter-reasoning">Reasoning</button> | |
<button onclick="filterDatasets('knowledge')" class="px-3 py-1 bg-gray-200 text-gray-700 rounded-full text-sm hover:bg-gray-300 transition-colors" id="dataset-filter-knowledge">Knowledge</button> | |
<button onclick="filterDatasets('math')" class="px-3 py-1 bg-gray-200 text-gray-700 rounded-full text-sm hover:bg-gray-300 transition-colors" id="dataset-filter-math">Math</button> | |
<button onclick="filterDatasets('code')" class="px-3 py-1 bg-gray-200 text-gray-700 rounded-full text-sm hover:bg-gray-300 transition-colors" id="dataset-filter-code">Code</button> | |
<button onclick="filterDatasets('language')" class="px-3 py-1 bg-gray-200 text-gray-700 rounded-full text-sm hover:bg-gray-300 transition-colors" id="dataset-filter-language">Language</button> | |
</div> | |
</div> | |
<!-- Dataset Grid --> | |
<div id="datasetGrid" class="space-y-3 max-h-64 overflow-y-auto"> | |
<!-- Datasets will be populated by JavaScript --> | |
</div> | |
</div> | |
<!-- Configuration --> | |
<div class="bg-white rounded-xl shadow-lg p-6 card-hover"> | |
<div class="flex items-center space-x-3 mb-6"> | |
<i data-lucide="settings" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Evaluation Configuration</h2> | |
</div> | |
<div class="grid grid-cols-1 md:grid-cols-2 gap-6"> | |
<!-- Metrics Selection --> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-3">Metrics</label> | |
<div id="metricsGrid" class="space-y-2"> | |
<!-- Metrics will be populated by JavaScript --> | |
</div> | |
</div> | |
<!-- Parameters --> | |
<div class="space-y-4"> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Sample Size</label> | |
<input type="range" id="sampleSize" min="10" max="1000" value="50" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>10</span> | |
<span id="sampleSizeValue">50</span> | |
<span>1000</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Temperature</label> | |
<input type="range" id="temperature" min="0" max="2" step="0.1" value="0.7" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>0.0</span> | |
<span id="temperatureValue">0.7</span> | |
<span>2.0</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Max Tokens</label> | |
<input type="range" id="maxTokens" min="128" max="2048" step="128" value="512" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>128</span> | |
<span id="maxTokensValue">512</span> | |
<span>2048</span> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Start Evaluation Button --> | |
<div class="mt-6"> | |
<button onclick="startEvaluation()" id="startBtn" | |
class="w-full gradient-bg text-white py-3 px-6 rounded-lg font-semibold hover:opacity-90 transition-opacity disabled:opacity-50 disabled:cursor-not-allowed"> | |
<i data-lucide="play" class="w-5 h-5 inline mr-2"></i> | |
Start Evaluation | |
</button> | |
</div> | |
</div> | |
</div> | |
<!-- Right Panel - Progress & Results --> | |
<div class="space-y-6"> | |
<!-- Progress --> | |
<div class="bg-white rounded-xl shadow-lg p-6 card-hover"> | |
<div class="flex items-center space-x-3 mb-4"> | |
<i data-lucide="activity" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Progress</h2> | |
</div> | |
<div id="progressSection" class="hidden"> | |
<div class="mb-4"> | |
<div class="flex justify-between text-sm text-gray-600 mb-2"> | |
<span id="currentStep">Initializing...</span> | |
<span id="progressPercent">0%</span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-2"> | |
<div id="progressBar" class="bg-gradient-to-r from-purple-500 to-blue-500 h-2 rounded-full progress-bar" style="width: 0%"></div> | |
</div> | |
</div> | |
</div> | |
<div id="idleMessage" class="text-center text-gray-500 py-8"> | |
<i data-lucide="clock" class="w-12 h-12 mx-auto mb-3 text-gray-300"></i> | |
<p>Ready to start evaluation</p> | |
</div> | |
</div> | |
<!-- Live Logs --> | |
<div class="bg-white rounded-xl shadow-lg p-6 card-hover"> | |
<div class="flex items-center space-x-3 mb-4"> | |
<i data-lucide="terminal" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Live Logs</h2> | |
</div> | |
<div id="logsContainer" class="bg-gray-900 text-green-400 p-4 rounded-lg h-64 overflow-y-auto font-mono text-sm"> | |
<div class="text-gray-500">Waiting for evaluation to start...</div> | |
</div> | |
</div> | |
<!-- Results --> | |
<div id="resultsSection" class="bg-white rounded-xl shadow-lg p-6 card-hover hidden"> | |
<div class="flex items-center space-x-3 mb-4"> | |
<i data-lucide="bar-chart" class="w-6 h-6 text-purple-600"></i> | |
<h2 class="text-xl font-semibold text-gray-800">Results</h2> | |
</div> | |
<div id="resultsContent"> | |
<!-- Results will be populated by JavaScript --> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<script> | |
// Global state | |
let selectedModels = []; | |
let selectedDataset = null; | |
let selectedMetrics = []; | |
let websocket = null; | |
let currentEvaluationId = null; | |
// Models data | |
const models = """ + json.dumps(HF_MODELS) + """; | |
const datasets = """ + json.dumps(EVALUATION_DATASETS) + """; | |
const metrics = """ + json.dumps(EVALUATION_METRICS) + """; | |
// Initialize the application | |
document.addEventListener('DOMContentLoaded', function() { | |
lucide.createIcons(); | |
renderModels(); | |
renderDatasets(); | |
renderMetrics(); | |
setupEventListeners(); | |
}); | |
function setupEventListeners() { | |
// Sample size slider | |
document.getElementById('sampleSize').addEventListener('input', function() { | |
document.getElementById('sampleSizeValue').textContent = this.value; | |
}); | |
// Temperature slider | |
document.getElementById('temperature').addEventListener('input', function() { | |
document.getElementById('temperatureValue').textContent = this.value; | |
}); | |
// Max tokens slider | |
document.getElementById('maxTokens').addEventListener('input', function() { | |
document.getElementById('maxTokensValue').textContent = this.value; | |
}); | |
// Model search | |
document.getElementById('modelSearch').addEventListener('input', function() { | |
const searchTerm = this.value.toLowerCase(); | |
filterModelsBySearch(searchTerm); | |
}); | |
} | |
function renderModels() { | |
const grid = document.getElementById('modelGrid'); | |
grid.innerHTML = ''; | |
Object.keys(models).forEach(category => { | |
models[category].forEach(model => { | |
const modelCard = createModelCard(model, category); | |
grid.appendChild(modelCard); | |
}); | |
}); | |
} | |
function createModelCard(model, category) { | |
const div = document.createElement('div'); | |
div.className = `model-card p-4 border rounded-lg cursor-pointer hover:shadow-md transition-all`; | |
div.dataset.category = category; | |
div.dataset.modelId = model.id; | |
div.innerHTML = ` | |
<div class="flex items-start justify-between mb-2"> | |
<div class="flex-1"> | |
<h3 class="font-semibold text-gray-800 text-sm">${model.name}</h3> | |
<p class="text-xs text-gray-500">${model.provider} • ${model.size}</p> | |
</div> | |
<div class="text-xs bg-gray-100 px-2 py-1 rounded">${model.size}</div> | |
</div> | |
<p class="text-xs text-gray-600 mb-2">${model.description}</p> | |
<div class="flex flex-wrap gap-1"> | |
${model.capabilities.map(cap => `<span class="text-xs bg-purple-100 text-purple-700 px-2 py-1 rounded">${cap}</span>`).join('')} | |
</div> | |
`; | |
div.addEventListener('click', () => toggleModelSelection(model.id, div)); | |
return div; | |
} | |
function toggleModelSelection(modelId, element) { | |
if (selectedModels.includes(modelId)) { | |
selectedModels = selectedModels.filter(id => id !== modelId); | |
element.classList.remove('selected'); | |
} else { | |
selectedModels.push(modelId); | |
element.classList.add('selected'); | |
} | |
updateSelectedModelsCount(); | |
} | |
function updateSelectedModelsCount() { | |
document.getElementById('selectedModelsCount').textContent = selectedModels.length; | |
} | |
function filterModels(category) { | |
// Update filter buttons | |
document.querySelectorAll('[id^="filter-"]').forEach(btn => { | |
btn.className = btn.className.replace('bg-purple-600 text-white', 'bg-gray-200 text-gray-700'); | |
}); | |
document.getElementById(`filter-${category}`).className = | |
document.getElementById(`filter-${category}`).className.replace('bg-gray-200 text-gray-700', 'bg-purple-600 text-white'); | |
// Filter model cards | |
document.querySelectorAll('.model-card').forEach(card => { | |
if (category === 'all' || card.dataset.category === category) { | |
card.style.display = 'block'; | |
} else { | |
card.style.display = 'none'; | |
} | |
}); | |
} | |
function filterModelsBySearch(searchTerm) { | |
document.querySelectorAll('.model-card').forEach(card => { | |
const modelName = card.querySelector('h3').textContent.toLowerCase(); | |
const modelProvider = card.querySelector('p').textContent.toLowerCase(); | |
if (modelName.includes(searchTerm) || modelProvider.includes(searchTerm)) { | |
card.style.display = 'block'; | |
} else { | |
card.style.display = 'none'; | |
} | |
}); | |
} | |
function renderDatasets() { | |
const grid = document.getElementById('datasetGrid'); | |
grid.innerHTML = ''; | |
Object.keys(datasets).forEach(category => { | |
datasets[category].forEach(dataset => { | |
const datasetCard = createDatasetCard(dataset, category); | |
grid.appendChild(datasetCard); | |
}); | |
}); | |
} | |
function createDatasetCard(dataset, category) { | |
const div = document.createElement('div'); | |
div.className = `dataset-card p-3 border rounded-lg cursor-pointer hover:shadow-md transition-all`; | |
div.dataset.category = category; | |
div.dataset.datasetId = dataset.id; | |
div.innerHTML = ` | |
<div class="flex items-start justify-between mb-2"> | |
<div class="flex-1"> | |
<h3 class="font-semibold text-gray-800 text-sm">${dataset.name}</h3> | |
<p class="text-xs text-gray-600">${dataset.description}</p> | |
</div> | |
<div class="text-xs bg-gray-100 px-2 py-1 rounded">${dataset.samples.toLocaleString()}</div> | |
</div> | |
<div class="flex justify-between items-center"> | |
<span class="text-xs bg-blue-100 text-blue-700 px-2 py-1 rounded">${dataset.task_type}</span> | |
<span class="text-xs text-gray-500">${dataset.difficulty}</span> | |
</div> | |
`; | |
div.addEventListener('click', () => selectDataset(dataset.id, div)); | |
return div; | |
} | |
function selectDataset(datasetId, element) { | |
// Remove previous selection | |
document.querySelectorAll('.dataset-card').forEach(card => { | |
card.classList.remove('selected'); | |
}); | |
// Add selection to clicked element | |
element.classList.add('selected'); | |
selectedDataset = datasetId; | |
} | |
function filterDatasets(category) { | |
// Update filter buttons | |
document.querySelectorAll('[id^="dataset-filter-"]').forEach(btn => { | |
btn.className = btn.className.replace('bg-purple-600 text-white', 'bg-gray-200 text-gray-700'); | |
}); | |
document.getElementById(`dataset-filter-${category}`).className = | |
document.getElementById(`dataset-filter-${category}`).className.replace('bg-gray-200 text-gray-700', 'bg-purple-600 text-white'); | |
// Filter dataset cards | |
document.querySelectorAll('.dataset-card').forEach(card => { | |
if (category === 'all' || card.dataset.category === category) { | |
card.style.display = 'block'; | |
} else { | |
card.style.display = 'none'; | |
} | |
}); | |
} | |
function renderMetrics() { | |
const grid = document.getElementById('metricsGrid'); | |
grid.innerHTML = ''; | |
metrics.forEach(metric => { | |
const div = document.createElement('div'); | |
div.className = 'flex items-center space-x-2'; | |
div.innerHTML = ` | |
<input type="checkbox" id="metric-${metric.id}" class="rounded text-purple-600 focus:ring-purple-500"> | |
<label for="metric-${metric.id}" class="text-sm text-gray-700 cursor-pointer">${metric.name}</label> | |
`; | |
const checkbox = div.querySelector('input'); | |
checkbox.addEventListener('change', () => { | |
if (checkbox.checked) { | |
selectedMetrics.push(metric.id); | |
} else { | |
selectedMetrics = selectedMetrics.filter(id => id !== metric.id); | |
} | |
}); | |
grid.appendChild(div); | |
}); | |
} | |
// Validate the current selection, POST it to /api/evaluate and, when the server
// reports status "started", open the progress WebSocket for the returned id.
// Any failure is reported through alert() and leaves the start button usable.
function startEvaluation() {
    // Validation
    if (selectedModels.length === 0) {
        alert('Please select at least one model');
        return;
    }
    if (!selectedDataset) {
        alert('Please select a dataset');
        return;
    }
    if (selectedMetrics.length === 0) {
        alert('Please select at least one metric');
        return;
    }

    // Parse explicitly as base 10 and reject non-numeric input up front:
    // NaN would otherwise be serialized as null by JSON.stringify.
    const sampleSize = parseInt(document.getElementById('sampleSize').value, 10);
    const temperature = parseFloat(document.getElementById('temperature').value);
    const maxTokens = parseInt(document.getElementById('maxTokens').value, 10);
    if ([sampleSize, temperature, maxTokens].some(value => Number.isNaN(value))) {
        alert('Please enter valid numbers for sample size, temperature and max tokens');
        return;
    }

    // Prepare request
    const request = {
        models: selectedModels,
        dataset: selectedDataset,
        metrics: selectedMetrics,
        sample_size: sampleSize,
        temperature: temperature,
        max_tokens: maxTokens,
        top_p: 0.9
    };

    // Lock the button while the request is in flight so a double click cannot
    // start two evaluations; it is released again on every failure path.
    disableStartButton();

    // Start evaluation
    fetch('/api/evaluate', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(request)
    })
    // Keep the HTTP status next to the parsed body so error responses are not
    // mistaken for a successful start.
    .then(response => response.json().then(data => ({ ok: response.ok, data })))
    .then(({ ok, data }) => {
        if (ok && data.status === 'started') {
            currentEvaluationId = data.evaluation_id;
            connectWebSocket(data.evaluation_id);
            showProgress();
            return;
        }
        // Error bodies may carry `detail` instead of `message`.
        const reason = data.message ||
            (typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail));
        enableStartButton();
        alert('Failed to start evaluation: ' + reason);
    })
    .catch(error => {
        console.error('Error:', error);
        enableStartButton();
        alert('Failed to start evaluation');
    });
}
// Open the progress WebSocket for `evaluationId` on the same host as the page
// (wss: when the page is served over https:, ws: otherwise) and route every
// parsed message to handleWebSocketMessage.
function connectWebSocket(evaluationId) {
    // Close a socket left over from a previous evaluation so connections do
    // not pile up across runs. readyState 0/1 means CONNECTING/OPEN.
    if (typeof websocket !== 'undefined' && websocket && websocket.readyState <= WebSocket.OPEN) {
        websocket.close();
    }

    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const wsUrl = `${protocol}//${window.location.host}/ws/${evaluationId}`;
    websocket = new WebSocket(wsUrl);
    websocket.onmessage = function(event) {
        let data;
        try {
            data = JSON.parse(event.data);
        } catch (parseError) {
            // A frame that is not valid JSON must not break the handler for
            // the frames that follow it.
            console.error('WebSocket error:', parseError);
            return;
        }
        handleWebSocketMessage(data);
    };
    websocket.onclose = function() {
        console.log('WebSocket connection closed');
    };
    websocket.onerror = function(error) {
        console.error('WebSocket error:', error);
    };
}
// Route one parsed WebSocket message by its `type` field:
//   progress -> update the progress bar and step label
//   log      -> append the message to the log view
//   complete -> render results and re-enable the start button
//   error    -> log the message at ERROR level and re-enable the start button
// Any other type is ignored.
function handleWebSocketMessage(data) {
    const kind = data.type;

    if (kind === 'progress') {
        updateProgress(data.progress, data.current_step);
        return;
    }
    if (kind === 'log') {
        addLogEntry(data);
        return;
    }
    if (kind === 'complete') {
        showResults(data.results);
        enableStartButton();
        return;
    }
    if (kind === 'error') {
        const errorEntry = {
            level: 'ERROR',
            message: data.message,
            timestamp: new Date().toISOString()
        };
        addLogEntry(errorEntry);
        enableStartButton();
    }
}
// Swap the idle placeholder for the progress section and start with empty logs.
function showProgress() {
    const idlePanel = document.getElementById('idleMessage');
    const progressPanel = document.getElementById('progressSection');
    idlePanel.classList.add('hidden');
    progressPanel.classList.remove('hidden');
    clearLogs();
}
// Reflect `progress` (used as a percentage) in the bar width and the rounded
// percent label, and show `currentStep` as the step text.
function updateProgress(progress, currentStep) {
    const bar = document.getElementById('progressBar');
    bar.style.width = progress + '%';

    const percentLabel = document.getElementById('progressPercent');
    percentLabel.textContent = `${Math.round(progress)}%`;

    const stepLabel = document.getElementById('currentStep');
    stepLabel.textContent = currentStep;
}
// Append one log line "[time] [LEVEL] message" to #logsContainer and scroll the
// container to the bottom.
//   logData.timestamp - value accepted by the Date constructor
//   logData.level     - INFO | SUCCESS | ERROR | DEBUG (others use the default colour)
//   logData.message   - text to display
function addLogEntry(logData) {
    const container = document.getElementById('logsContainer');
    const entry = document.createElement('div');
    entry.className = 'log-entry mb-1';
    const timestamp = new Date(logData.timestamp).toLocaleTimeString();

    const levelColors = {
        'INFO': 'text-blue-400',
        'SUCCESS': 'text-green-400',
        'ERROR': 'text-red-400',
        'DEBUG': 'text-gray-400'
    };
    // Own-property check so a level such as "constructor" cannot resolve to an
    // inherited Object.prototype member.
    const levelColor = Object.prototype.hasOwnProperty.call(levelColors, logData.level)
        ? levelColors[logData.level]
        : 'text-green-400';

    // Log text arrives over the WebSocket; write it through textContent so any
    // markup it contains is shown literally instead of being parsed as HTML.
    const makeSpan = (className, text) => {
        const span = document.createElement('span');
        if (className) {
            span.className = className;
        }
        span.textContent = text;
        return span;
    };

    entry.append(
        makeSpan('text-gray-500', `[${timestamp}]`),
        ' ',
        makeSpan(levelColor, `[${logData.level}]`),
        ' ',
        makeSpan('', `${logData.message}`)
    );
    container.appendChild(entry);
    container.scrollTop = container.scrollHeight;
}
// Remove every entry from the log view.
function clearLogs() {
    const logsContainer = document.getElementById('logsContainer');
    logsContainer.innerHTML = '';
}
// Render the evaluation results into #resultsContent and reveal #resultsSection.
//   results - object keyed by model id; each value is an object mapping a
//             metric name to its value. A missing value renders an empty list.
function showResults(results) {
    const section = document.getElementById('resultsSection');
    const content = document.getElementById('resultsContent');

    // Build nodes instead of an HTML string so model ids, metric names and
    // values are inserted as text and can never be parsed as markup.
    const el = (tag, className, text) => {
        const node = document.createElement(tag);
        node.className = className;
        if (text !== undefined) {
            node.textContent = text;
        }
        return node;
    };

    const list = el('div', 'space-y-4');
    // `|| {}` keeps a "complete" message without results from throwing here,
    // which would stop the caller before it re-enables the start button.
    Object.entries(results || {}).forEach(([modelId, modelResults]) => {
        const card = el('div', 'border rounded-lg p-4');
        // Show only the last path segment of the model id, e.g. "org/name" -> "name".
        card.appendChild(el('h3', 'font-semibold text-gray-800 mb-3', modelId.split('/').pop()));

        const grid = el('div', 'grid grid-cols-2 gap-3');
        Object.entries(modelResults || {}).forEach(([metric, value]) => {
            const cell = el('div', 'bg-gray-50 p-3 rounded');
            cell.appendChild(el('div', 'text-sm text-gray-600', metric.toUpperCase()));
            cell.appendChild(el('div', 'text-lg font-semibold text-gray-800', `${value}`));
            grid.appendChild(cell);
        });

        card.appendChild(grid);
        list.appendChild(card);
    });

    content.innerHTML = '';
    content.appendChild(list);
    section.classList.remove('hidden');
}
// Put the start button into its "running" state: disabled, with a spinning
// loader icon that lucide renders from the data-lucide placeholder.
function disableStartButton() {
    const startButton = document.getElementById('startBtn');
    Object.assign(startButton, {
        disabled: true,
        innerHTML: '<i data-lucide="loader" class="w-5 h-5 inline mr-2 animate-spin"></i>Running Evaluation...'
    });
    lucide.createIcons();
}
// Restore the start button to its idle state: enabled, with the play icon that
// lucide renders from the data-lucide placeholder.
function enableStartButton() {
    const startButton = document.getElementById('startBtn');
    Object.assign(startButton, {
        disabled: false,
        innerHTML: '<i data-lucide="play" class="w-5 h-5 inline mr-2"></i>Start Evaluation'
    });
    lucide.createIcons();
}
</script> | |
</body> | |
</html> | |
""" | |
async def get_models():
    """Return the configured model catalog under the ``models`` key."""
    payload = {"models": HF_MODELS}
    return payload
async def get_datasets():
    """Return the configured evaluation datasets under the ``datasets`` key."""
    payload = {"datasets": EVALUATION_DATASETS}
    return payload
async def get_metrics():
    """Return the configured evaluation metrics under the ``metrics`` key."""
    payload = {"metrics": EVALUATION_METRICS}
    return payload
async def start_evaluation(request: EvaluationRequest):
    """Start a new evaluation in the background and return its identifier.

    Args:
        request: Evaluation parameters; passed unchanged to
            ``simulate_evaluation`` together with the generated id.

    Returns:
        An ``EvaluationResponse`` carrying a fresh UUID4 ``evaluation_id``
        and the status ``"started"``.
    """
    evaluation_id = str(uuid.uuid4())
    # The event loop keeps only weak references to tasks, so hold a strong
    # reference until the task is done; otherwise it may be garbage-collected
    # before the evaluation finishes.
    task = asyncio.create_task(simulate_evaluation(evaluation_id, request))
    _background_evaluation_tasks.add(task)
    task.add_done_callback(_finish_evaluation_task)
    return EvaluationResponse(
        evaluation_id=evaluation_id,
        status="started",
        message="Evaluation started successfully"
    )


def _finish_evaluation_task(task):
    """Release a finished evaluation task and log any unhandled failure."""
    _background_evaluation_tasks.discard(task)
    if task.cancelled():
        return
    # Retrieving the exception also marks it as handled for asyncio.
    error = task.exception()
    if error is not None:
        logger.error("Evaluation task failed: %s", error, exc_info=error)


# Strong references to evaluation tasks that are still running.
_background_evaluation_tasks = set()
async def get_evaluation_status(evaluation_id: str):
    """Return the stored state of an evaluation.

    Raises:
        HTTPException: 404 when ``evaluation_id`` is not in ``active_evaluations``.
    """
    try:
        return active_evaluations[evaluation_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Evaluation not found") from None
async def websocket_endpoint(websocket: WebSocket, evaluation_id: str):
    """WebSocket endpoint for real-time updates.

    Accepts the connection, registers it in ``websocket_connections`` under
    ``evaluation_id`` and keeps it open until the client disconnects.

    Args:
        websocket: The incoming WebSocket connection.
        evaluation_id: Key under which the connection is registered.
    """
    await websocket.accept()
    websocket_connections[evaluation_id] = websocket
    try:
        while True:
            # Wait on the socket rather than sleeping: a client disconnect is
            # only surfaced by a receive call. Incoming payloads are ignored.
            await websocket.receive_text()
    except WebSocketDisconnect:
        logger.info("WebSocket disconnected for evaluation %s", evaluation_id)
    finally:
        # Clean up on every exit path, but only remove our own entry so a
        # newer connection registered under the same id is left in place.
        if websocket_connections.get(evaluation_id) is websocket:
            del websocket_connections[evaluation_id]
async def health_check():
    """Report that the service is up, with the current time in ISO 8601 form."""
    status = "healthy"
    checked_at = datetime.now().isoformat()
    return {"status": status, "timestamp": checked_at}
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)