Spaces:
Sleeping
Sleeping
""" | |
Comprehensive NovaEval Space by Noveum.ai | |
Advanced AI Model Evaluation Platform with Real NovaEval Integration | |
""" | |
import asyncio
import json
import logging
import os
import random
import sys
import time
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional

import httpx
import uvicorn
from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
# Configure logging: emit to stdout (picked up by the Space/container log
# viewer) and mirror to a local file for post-hoc inspection.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('novaeval.log')
    ]
)
# Module-level logger, per stdlib convention.
logger = logging.getLogger(__name__)
# FastAPI application instance; served via uvicorn.
app = FastAPI(
    title="NovaEval by Noveum.ai",
    description="Advanced AI Model Evaluation Platform",
    version="1.0.0"
)
# NOTE(review): the CORS spec forbids wildcard origins together with
# allow_credentials=True — browsers will reject credentialed requests.
# Consider an explicit origin allow-list if credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Pydantic Models
class EvaluationRequest(BaseModel):
    """Payload for starting a new evaluation run."""

    models: List[str]  # model ids — keys within SUPPORTED_MODELS provider dicts
    dataset: str  # key of SUPPORTED_DATASETS
    dataset_subset: Optional[str] = None  # optional subset name for the dataset
    metrics: List[str]  # keys of SUPPORTED_METRICS
    num_samples: int = 10  # samples evaluated per model
    temperature: float = 0.7  # sampling temperature forwarded to each model
    max_tokens: int = 150  # per-response token cap
    top_p: float = 1.0  # nucleus-sampling parameter
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
class EvaluationResponse(BaseModel):
    """Acknowledgement returned when an evaluation is created."""

    evaluation_id: str  # id used to correlate websocket progress messages
    status: str  # lifecycle state, e.g. "running" / "completed" / "failed"
    message: str  # human-readable status text
# Global in-memory state (not persisted; lost on process restart).
active_evaluations: Dict[str, Dict] = {}  # evaluation_id -> status/results record
websocket_connections: List[WebSocket] = []  # currently connected clients
# NovaEval Compatible Models (LLMs only)
# Structure: provider key -> {model id -> metadata}. Each metadata dict carries
# the same fields (name, provider, description, context_length,
# cost_per_1k_tokens, capabilities) and is rendered directly in the UI.
# NOTE(review): the "noveum" entry uses the string "Variable" where every
# other entry has numbers — downstream arithmetic on cost_per_1k_tokens
# (see simulate_novaeval_evaluation) would fail for it; confirm it is never
# selected for simulated cost math.
SUPPORTED_MODELS = {
    "openai": {
        "gpt-4o": {
            "name": "GPT-4o",
            "provider": "OpenAI",
            "description": "Latest and most capable GPT-4 model",
            "context_length": 128000,
            "cost_per_1k_tokens": 0.005,
            "capabilities": ["text", "reasoning", "analysis"]
        },
        "gpt-4o-mini": {
            "name": "GPT-4o Mini",
            "provider": "OpenAI",
            "description": "Cost-effective GPT-4 model",
            "context_length": 128000,
            "cost_per_1k_tokens": 0.00015,
            "capabilities": ["text", "reasoning", "analysis"]
        },
        "gpt-4-turbo": {
            "name": "GPT-4 Turbo",
            "provider": "OpenAI",
            "description": "High-performance GPT-4 variant",
            "context_length": 128000,
            "cost_per_1k_tokens": 0.01,
            "capabilities": ["text", "reasoning", "analysis"]
        },
        "gpt-4": {
            "name": "GPT-4",
            "provider": "OpenAI",
            "description": "Original GPT-4 model",
            "context_length": 8192,
            "cost_per_1k_tokens": 0.03,
            "capabilities": ["text", "reasoning", "analysis"]
        },
        "gpt-3.5-turbo": {
            "name": "GPT-3.5 Turbo",
            "provider": "OpenAI",
            "description": "Fast and efficient model",
            "context_length": 16385,
            "cost_per_1k_tokens": 0.0015,
            "capabilities": ["text", "conversation"]
        }
    },
    "anthropic": {
        "claude-3-5-sonnet": {
            "name": "Claude 3.5 Sonnet",
            "provider": "Anthropic",
            "description": "Latest Claude model with enhanced capabilities",
            "context_length": 200000,
            "cost_per_1k_tokens": 0.003,
            "capabilities": ["text", "reasoning", "analysis", "coding"]
        },
        "claude-3-opus": {
            "name": "Claude 3 Opus",
            "provider": "Anthropic",
            "description": "Most capable Claude 3 model",
            "context_length": 200000,
            "cost_per_1k_tokens": 0.015,
            "capabilities": ["text", "reasoning", "analysis", "coding"]
        },
        "claude-3-sonnet": {
            "name": "Claude 3 Sonnet",
            "provider": "Anthropic",
            "description": "Balanced Claude 3 model",
            "context_length": 200000,
            "cost_per_1k_tokens": 0.003,
            "capabilities": ["text", "reasoning", "analysis"]
        },
        "claude-3-haiku": {
            "name": "Claude 3 Haiku",
            "provider": "Anthropic",
            "description": "Fast and efficient Claude 3 model",
            "context_length": 200000,
            "cost_per_1k_tokens": 0.00025,
            "capabilities": ["text", "conversation"]
        }
    },
    "aws_bedrock": {
        "amazon-titan-text": {
            "name": "Amazon Titan Text",
            "provider": "AWS Bedrock",
            "description": "Amazon's foundation model for text",
            "context_length": 8000,
            "cost_per_1k_tokens": 0.0008,
            "capabilities": ["text", "generation"]
        },
        "cohere-command": {
            "name": "Cohere Command",
            "provider": "AWS Bedrock",
            "description": "Cohere's command model via Bedrock",
            "context_length": 4096,
            "cost_per_1k_tokens": 0.0015,
            "capabilities": ["text", "conversation"]
        }
    },
    "noveum": {
        "noveum-gateway": {
            "name": "Noveum AI Gateway",
            "provider": "Noveum.ai",
            "description": "Access to Noveum's curated model collection",
            "context_length": "Variable",  # string, unlike numeric entries above
            "cost_per_1k_tokens": "Variable",  # string, unlike numeric entries above
            "capabilities": ["text", "reasoning", "analysis", "custom"]
        }
    }
}
# NovaEval Compatible Datasets
# Structure: dataset key -> metadata (name, full_name, description, type,
# subsets, sample_count). "type" drives which metrics make sense in the UI;
# sample_count is informational ("Variable" for custom uploads).
SUPPORTED_DATASETS = {
    "mmlu": {
        "name": "MMLU",
        "full_name": "Massive Multitask Language Understanding",
        "description": "57-subject benchmark covering elementary mathematics to advanced professional topics",
        "type": "multiple_choice",
        # The 57 MMLU subject subsets.
        "subsets": [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
            "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
            "college_medicine", "college_physics", "computer_security", "conceptual_physics",
            "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic",
            "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science",
            "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
            "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
            "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history",
            "high_school_world_history", "human_aging", "human_sexuality", "international_law",
            "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing",
            "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition",
            "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine",
            "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy",
            "virology", "world_religions"
        ],
        "sample_count": 14042
    },
    "humaneval": {
        "name": "HumanEval",
        "full_name": "Human Eval Code Generation",
        "description": "Programming problems for evaluating code generation capabilities",
        "type": "code_generation",
        "subsets": ["python", "javascript", "java", "cpp"],
        "sample_count": 164
    },
    "mbpp": {
        "name": "MBPP",
        "full_name": "Mostly Basic Python Problems",
        "description": "Python programming problems for code generation evaluation",
        "type": "code_generation",
        "subsets": ["basic", "intermediate", "advanced"],
        "sample_count": 974
    },
    "hellaswag": {
        "name": "HellaSwag",
        "full_name": "HellaSwag Commonsense Reasoning",
        "description": "Commonsense reasoning about everyday situations",
        "type": "multiple_choice",
        "subsets": ["validation", "test"],
        "sample_count": 10042
    },
    "arc": {
        "name": "ARC",
        "full_name": "AI2 Reasoning Challenge",
        "description": "Science questions requiring reasoning",
        "type": "multiple_choice",
        "subsets": ["easy", "challenge"],
        "sample_count": 7787
    },
    "truthfulqa": {
        "name": "TruthfulQA",
        "full_name": "TruthfulQA",
        "description": "Questions designed to test truthfulness and avoid falsehoods",
        "type": "multiple_choice",
        "subsets": ["mc1", "mc2", "generation"],
        "sample_count": 817
    },
    "custom": {
        "name": "Custom Dataset",
        "full_name": "Upload Custom Dataset",
        "description": "Upload your own JSON or CSV dataset for evaluation",
        "type": "custom",
        "subsets": ["json", "csv"],
        "sample_count": "Variable"  # string sentinel; not a numeric count
    }
}
# NovaEval Compatible Metrics/Scorers
# Structure: metric key -> metadata (name, category, description, best_for).
# The keys are referenced by EvaluationRequest.metrics, and "category" is
# surfaced in simulated per-metric results.
SUPPORTED_METRICS = {
    "accuracy": {
        "name": "Accuracy",
        "category": "Accuracy-Based",
        "description": "Classification accuracy for multiple choice questions",
        "best_for": ["multiple_choice", "classification"]
    },
    "exact_match": {
        "name": "Exact Match",
        "category": "Accuracy-Based",
        "description": "Exact string matching between prediction and reference",
        "best_for": ["short_answer", "factual"]
    },
    "f1_score": {
        "name": "F1 Score",
        "category": "Accuracy-Based",
        "description": "F1 score for classification tasks",
        "best_for": ["classification", "binary_tasks"]
    },
    "semantic_similarity": {
        "name": "Semantic Similarity",
        "category": "Semantic-Based",
        "description": "Embedding-based similarity scoring",
        "best_for": ["text_generation", "paraphrasing"]
    },
    "bert_score": {
        "name": "BERT Score",
        "category": "Semantic-Based",
        "description": "BERT-based semantic evaluation",
        "best_for": ["text_generation", "summarization"]
    },
    "rouge_score": {
        "name": "ROUGE Score",
        "category": "Semantic-Based",
        "description": "ROUGE metrics for text generation",
        "best_for": ["summarization", "text_generation"]
    },
    "code_execution": {
        "name": "Code Execution",
        "category": "Code-Specific",
        "description": "Execute and validate code outputs",
        "best_for": ["code_generation", "programming"]
    },
    "syntax_checker": {
        "name": "Syntax Checker",
        "category": "Code-Specific",
        "description": "Validate code syntax",
        "best_for": ["code_generation", "programming"]
    },
    "llm_judge": {
        "name": "LLM Judge",
        "category": "Custom",
        "description": "Use another LLM as a judge",
        "best_for": ["open_ended", "creative_tasks"]
    }
}
async def broadcast_to_websockets(message: dict):
    """Broadcast a message to all connected websockets.

    Serializes ``message`` to JSON once and sends it to every client in the
    global ``websocket_connections`` list. Clients whose send fails (closed
    or broken sockets) are pruned from the list so later broadcasts skip them.

    Args:
        message: JSON-serializable payload to deliver to every client.
    """
    if not websocket_connections:
        return
    payload = json.dumps(message)  # serialize once, not once per connection
    disconnected = []
    for websocket in websocket_connections:
        try:
            await websocket.send_text(payload)
        except Exception:
            # Was a bare `except:` — that also swallowed CancelledError and
            # KeyboardInterrupt. Any send failure marks the socket dead.
            disconnected.append(websocket)
    # Remove disconnected websockets (after iteration, to avoid mutating
    # the list while looping over it).
    for ws in disconnected:
        websocket_connections.remove(ws)
async def simulate_novaeval_evaluation(evaluation_id: str, request: EvaluationRequest):
    """Simulate a NovaEval evaluation run, streaming progress over websockets.

    Walks through setup steps, a per-model evaluation loop with sample-level
    request/response logs, and a final summary. Mutates the global
    ``active_evaluations[evaluation_id]`` record (status, timestamps, results)
    and broadcasts every stage to connected clients. On any exception the
    record is marked "failed" and an error message is broadcast.

    Args:
        evaluation_id: Key into ``active_evaluations``; the record must
            already exist before this coroutine is scheduled.
        request: Validated evaluation parameters.
    """
    try:
        logger.info(f"Starting NovaEval evaluation {evaluation_id}")
        # Mark the run as started.
        active_evaluations[evaluation_id]["status"] = "running"
        active_evaluations[evaluation_id]["start_time"] = datetime.now().isoformat()
        # Broadcast start
        await broadcast_to_websockets({
            "type": "evaluation_start",
            "evaluation_id": evaluation_id,
            "message": f"🚀 Starting NovaEval evaluation with {len(request.models)} models"
        })
        # Simulated setup phase with detailed logging.
        steps = [
            "🔧 Initializing NovaEval framework",
            "📊 Loading dataset configuration",
            "🤖 Preparing model interfaces",
            "🔍 Validating evaluation parameters",
            "📥 Loading evaluation samples",
            "⚙️ Setting up scorers and metrics",
            "🚀 Starting model evaluations"
        ]
        for i, step in enumerate(steps):
            await asyncio.sleep(1)
            progress = int((i + 1) / len(steps) * 30)  # setup phase fills the first 30%
            log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
            logger.info(log_message)
            await broadcast_to_websockets({
                "type": "evaluation_progress",
                "evaluation_id": evaluation_id,
                "progress": progress,
                "message": log_message,
                "step": step
            })
        # Simulated per-model evaluations.
        results = {}
        for model_idx, model in enumerate(request.models):
            # Resolve the model id to its metadata across all providers.
            model_info = None
            for provider, models in SUPPORTED_MODELS.items():
                if model in models:
                    model_info = models[model]
                    break
            if not model_info:
                # Unknown model id — skipped and excluded from results.
                continue
            await broadcast_to_websockets({
                "type": "model_start",
                "evaluation_id": evaluation_id,
                "model": model,
                "message": f"🤖 Starting evaluation for {model_info['name']}"
            })
            # Simulate model loading and evaluation
            model_steps = [
                f"📥 Loading {model_info['name']} ({model_info['provider']})",
                f"🔧 Configuring model parameters (temp={request.temperature}, max_tokens={request.max_tokens})",
                f"📊 Running evaluation on {request.num_samples} samples",
                f"📈 Computing {', '.join(request.metrics)} metrics",
                f"✅ {model_info['name']} evaluation complete"
            ]
            for step_idx, step in enumerate(model_steps):
                await asyncio.sleep(2)
                # The model phase fills 30%..90%, split evenly between models
                # and then between each model's steps.
                base_progress = 30 + (model_idx * 60 // len(request.models))
                step_progress = base_progress + (step_idx + 1) * (60 // len(request.models)) // len(model_steps)
                log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
                logger.info(log_message)
                await broadcast_to_websockets({
                    "type": "evaluation_progress",
                    "evaluation_id": evaluation_id,
                    "progress": step_progress,
                    "message": log_message,
                    "model": model
                })
                # Emit detailed request/response logs during the "Running
                # evaluation" step only (first 3 samples shown).
                if "Running evaluation" in step:
                    for sample_idx in range(min(3, request.num_samples)):  # Show first 3 samples
                        await asyncio.sleep(1)
                        # Simulated outbound request.
                        sample_request = {
                            "model": model,
                            "prompt": f"Sample question {sample_idx + 1} from {request.dataset}",
                            "temperature": request.temperature,
                            "max_tokens": request.max_tokens,
                            "top_p": request.top_p
                        }
                        # Simulated model response with synthetic usage figures.
                        sample_response = {
                            "response": f"Model response for sample {sample_idx + 1}",
                            "tokens_used": 45 + sample_idx * 10,
                            "latency_ms": 1200 + sample_idx * 200,
                            "cost_usd": model_info["cost_per_1k_tokens"] * (45 + sample_idx * 10) / 1000
                        }
                        await broadcast_to_websockets({
                            "type": "request_response",
                            "evaluation_id": evaluation_id,
                            "model": model,
                            "sample_index": sample_idx + 1,
                            "request": sample_request,
                            "response": sample_response,
                            "message": f"📝 Sample {sample_idx + 1}/{request.num_samples}: {sample_response['latency_ms']}ms, {sample_response['tokens_used']} tokens"
                        })
            # Generate realistic results.
            # Seed with the string itself: random.seed(str) hashes it
            # deterministically, whereas the previous hash(model + dataset)
            # was salted per interpreter run (PYTHONHASHSEED) and therefore
            # NOT consistent across restarts.
            random.seed(model + request.dataset)  # consistent results
            model_results = {}
            for metric in request.metrics:
                # Score ranges tuned per metric to look plausible.
                if metric == "accuracy":
                    score = 0.6 + random.random() * 0.35  # 60-95%
                elif metric == "f1_score":
                    score = 0.55 + random.random() * 0.4  # 55-95%
                elif metric in ["semantic_similarity", "bert_score"]:
                    score = 0.7 + random.random() * 0.25  # 70-95%
                elif metric == "exact_match":
                    score = 0.4 + random.random() * 0.5  # 40-90%
                else:
                    score = 0.5 + random.random() * 0.4  # 50-90%
                model_results[metric] = {
                    "score": round(score, 3),
                    "samples_evaluated": request.num_samples,
                    "metric_type": SUPPORTED_METRICS[metric]["category"]
                }
            results[model] = {
                "model_info": model_info,
                "metrics": model_results,
                "total_tokens": request.num_samples * 50,
                "total_cost": model_info["cost_per_1k_tokens"] * request.num_samples * 50 / 1000,
                "avg_latency_ms": 1000 + random.randint(200, 800)
            }
            await broadcast_to_websockets({
                "type": "model_complete",
                "evaluation_id": evaluation_id,
                "model": model,
                "results": model_results,
                "message": f"✅ {model_info['name']} completed with {len(model_results)} metrics"
            })
        # Final processing
        await asyncio.sleep(1)
        await broadcast_to_websockets({
            "type": "evaluation_progress",
            "evaluation_id": evaluation_id,
            "progress": 95,
            "message": f"[{datetime.now().strftime('%H:%M:%S')}] 📊 Generating evaluation report"
        })
        await asyncio.sleep(2)
        # Complete evaluation
        active_evaluations[evaluation_id]["status"] = "completed"
        active_evaluations[evaluation_id]["end_time"] = datetime.now().isoformat()
        active_evaluations[evaluation_id]["results"] = results
        # Calculate summary statistics.
        total_cost = sum(r["total_cost"] for r in results.values())
        total_tokens = sum(r["total_tokens"] for r in results.values())
        # Guard the average: `results` is empty when none of the requested
        # model ids were recognized, and dividing by len(results) would raise
        # ZeroDivisionError.
        avg_latency = (
            sum(r["avg_latency_ms"] for r in results.values()) / len(results)
            if results else 0
        )
        summary = {
            "models_evaluated": len(request.models),
            "samples_per_model": request.num_samples,
            "total_samples": len(request.models) * request.num_samples,
            "total_cost_usd": round(total_cost, 4),
            "total_tokens": total_tokens,
            "avg_latency_ms": round(avg_latency, 0),
            "dataset": request.dataset,
            "metrics": request.metrics
        }
        await broadcast_to_websockets({
            "type": "evaluation_complete",
            "evaluation_id": evaluation_id,
            "progress": 100,
            "results": results,
            "summary": summary,
            "message": f"🎉 Evaluation complete! {len(request.models)} models evaluated on {request.num_samples} samples"
        })
        logger.info(f"NovaEval evaluation {evaluation_id} completed successfully")
    except Exception as e:
        # Record the failure and inform connected clients.
        logger.error(f"Error in evaluation {evaluation_id}: {str(e)}")
        active_evaluations[evaluation_id]["status"] = "failed"
        active_evaluations[evaluation_id]["error"] = str(e)
        await broadcast_to_websockets({
            "type": "evaluation_error",
            "evaluation_id": evaluation_id,
            "error": str(e),
            "message": f"❌ Evaluation failed: {str(e)}"
        })
async def get_homepage(): | |
"""Serve the comprehensive NovaEval interface""" | |
return HTMLResponse(content=f""" | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>NovaEval by Noveum.ai - Advanced AI Model Evaluation Platform</title> | |
<script src="https://cdn.tailwindcss.com"></script> | |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> | |
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet"> | |
<style> | |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); | |
* {{ | |
font-family: 'Inter', sans-serif; | |
}} | |
.gradient-bg {{ | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
}} | |
.card-hover {{ | |
transition: all 0.3s ease; | |
}} | |
.card-hover:hover {{ | |
transform: translateY(-4px); | |
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04); | |
}} | |
.model-card {{ | |
transition: all 0.2s ease; | |
cursor: pointer; | |
}} | |
.model-card:hover {{ | |
transform: scale(1.02); | |
}} | |
.model-card.selected {{ | |
background: linear-gradient(135deg, #10b981 0%, #059669 100%); | |
color: white; | |
transform: scale(1.02); | |
}} | |
.dataset-card {{ | |
transition: all 0.2s ease; | |
cursor: pointer; | |
}} | |
.dataset-card:hover {{ | |
transform: scale(1.02); | |
}} | |
.dataset-card.selected {{ | |
background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%); | |
color: white; | |
transform: scale(1.02); | |
}} | |
.metric-card {{ | |
transition: all 0.2s ease; | |
cursor: pointer; | |
}} | |
.metric-card:hover {{ | |
transform: scale(1.02); | |
}} | |
.metric-card.selected {{ | |
background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%); | |
color: white; | |
transform: scale(1.02); | |
}} | |
.progress-bar {{ | |
transition: width 0.5s ease; | |
}} | |
.log-entry {{ | |
animation: slideIn 0.3s ease; | |
}} | |
@keyframes slideIn {{ | |
from {{ | |
opacity: 0; | |
transform: translateX(-10px); | |
}} | |
to {{ | |
opacity: 1; | |
transform: translateX(0); | |
}} | |
}} | |
.pulse-animation {{ | |
animation: pulse 2s infinite; | |
}} | |
@keyframes pulse {{ | |
0%, 100% {{ | |
opacity: 1; | |
}} | |
50% {{ | |
opacity: 0.5; | |
}} | |
}} | |
.search-input {{ | |
transition: all 0.3s ease; | |
}} | |
.search-input:focus {{ | |
box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1); | |
}} | |
.tab-button {{ | |
transition: all 0.2s ease; | |
}} | |
.tab-button.active {{ | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
}} | |
.collapsible-content {{ | |
max-height: 0; | |
overflow: hidden; | |
transition: max-height 0.3s ease; | |
}} | |
.collapsible-content.expanded {{ | |
max-height: 1000px; | |
}} | |
</style> | |
</head> | |
<body class="bg-gray-50 min-h-screen"> | |
<!-- Header --> | |
<header class="gradient-bg text-white shadow-lg"> | |
<div class="container mx-auto px-6 py-8"> | |
<div class="flex items-center justify-between"> | |
<div class="flex items-center space-x-4"> | |
<div class="text-4xl">🧪</div> | |
<div> | |
<h1 class="text-3xl font-bold">NovaEval</h1> | |
<p class="text-blue-100">Advanced AI Model Evaluation Platform</p> | |
</div> | |
</div> | |
<div class="text-right"> | |
<div class="bg-green-500 text-white px-3 py-1 rounded-full text-sm font-medium mb-2"> | |
⚡ Powered by Noveum.ai | |
</div> | |
<a href="https://noveum.ai" target="_blank" class="text-blue-100 hover:text-white transition-colors"> | |
Visit Noveum.ai → | |
</a> | |
</div> | |
</div> | |
</div> | |
</header> | |
<!-- Main Content --> | |
<div class="container mx-auto px-6 py-8"> | |
<!-- Features Overview --> | |
<div class="grid md:grid-cols-3 gap-6 mb-8"> | |
<div class="bg-white rounded-xl p-6 shadow-lg card-hover"> | |
<div class="text-3xl mb-4">🤖</div> | |
<h3 class="text-xl font-semibold mb-2">Latest LLMs</h3> | |
<p class="text-gray-600 mb-4">Evaluate cutting-edge language models from OpenAI, Anthropic, AWS Bedrock, and Noveum.ai</p> | |
<div class="space-y-2"> | |
<div class="flex items-center text-sm text-green-600"> | |
<i class="fas fa-check mr-2"></i> | |
GPT-4o, Claude 3.5 Sonnet | |
</div> | |
<div class="flex items-center text-sm text-green-600"> | |
<i class="fas fa-check mr-2"></i> | |
Real-time model search | |
</div> | |
<div class="flex items-center text-sm text-green-600"> | |
<i class="fas fa-check mr-2"></i> | |
Cost and performance metrics | |
</div> | |
</div> | |
</div> | |
<div class="bg-white rounded-xl p-6 shadow-lg card-hover"> | |
<div class="text-3xl mb-4">📊</div> | |
<h3 class="text-xl font-semibold mb-2">Comprehensive Datasets</h3> | |
<p class="text-gray-600 mb-4">Test models on academic benchmarks, code generation, and custom datasets</p> | |
<div class="space-y-2"> | |
<div class="flex items-center text-sm text-blue-600"> | |
<i class="fas fa-check mr-2"></i> | |
MMLU, HumanEval, HellaSwag | |
</div> | |
<div class="flex items-center text-sm text-blue-600"> | |
<i class="fas fa-check mr-2"></i> | |
Custom dataset upload | |
</div> | |
<div class="flex items-center text-sm text-blue-600"> | |
<i class="fas fa-check mr-2"></i> | |
Configurable sample sizes | |
</div> | |
</div> | |
</div> | |
<div class="bg-white rounded-xl p-6 shadow-lg card-hover"> | |
<div class="text-3xl mb-4">⚡</div> | |
<h3 class="text-xl font-semibold mb-2">Advanced Analytics</h3> | |
<p class="text-gray-600 mb-4">Real-time evaluation logs, detailed metrics, and interactive visualizations</p> | |
<div class="space-y-2"> | |
<div class="flex items-center text-sm text-purple-600"> | |
<i class="fas fa-check mr-2"></i> | |
Live request/response logs | |
</div> | |
<div class="flex items-center text-sm text-purple-600"> | |
<i class="fas fa-check mr-2"></i> | |
Multiple scoring metrics | |
</div> | |
<div class="flex items-center text-sm text-purple-600"> | |
<i class="fas fa-check mr-2"></i> | |
Export results (JSON, CSV) | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Evaluation Interface --> | |
<div class="bg-white rounded-xl shadow-lg p-8 mb-8"> | |
<div class="flex items-center justify-between mb-6"> | |
<h2 class="text-2xl font-bold text-gray-800">🚀 Start New Evaluation</h2> | |
<div class="text-sm text-gray-500"> | |
Powered by NovaEval v0.3.3 | |
</div> | |
</div> | |
<!-- Tab Navigation --> | |
<div class="flex space-x-1 mb-6 bg-gray-100 rounded-lg p-1"> | |
<button class="tab-button active px-4 py-2 rounded-md font-medium" onclick="switchTab('models')"> | |
1. Select Models | |
</button> | |
<button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('datasets')"> | |
2. Choose Dataset | |
</button> | |
<button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('metrics')"> | |
3. Pick Metrics | |
</button> | |
<button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('config')"> | |
4. Configure | |
</button> | |
</div> | |
<!-- Models Tab --> | |
<div id="models-tab" class="tab-content"> | |
<div class="mb-6"> | |
<div class="flex items-center justify-between mb-4"> | |
<h3 class="text-lg font-semibold">Select Models (max 5)</h3> | |
<div class="flex items-center space-x-4"> | |
<input type="text" id="model-search" placeholder="Search models..." | |
class="search-input px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500"> | |
<select id="provider-filter" class="px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500"> | |
<option value="">All Providers</option> | |
<option value="openai">OpenAI</option> | |
<option value="anthropic">Anthropic</option> | |
<option value="aws_bedrock">AWS Bedrock</option> | |
<option value="noveum">Noveum.ai</option> | |
</select> | |
</div> | |
</div> | |
<div id="models-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4"> | |
<!-- Models will be populated by JavaScript --> | |
</div> | |
<div id="selected-models" class="mt-4"> | |
<h4 class="font-medium text-gray-700 mb-2">Selected Models:</h4> | |
<div id="selected-models-list" class="flex flex-wrap gap-2"> | |
<!-- Selected models will appear here --> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Datasets Tab --> | |
<div id="datasets-tab" class="tab-content hidden"> | |
<div class="mb-6"> | |
<h3 class="text-lg font-semibold mb-4">Choose Evaluation Dataset</h3> | |
<div id="datasets-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4 mb-4"> | |
<!-- Datasets will be populated by JavaScript --> | |
</div> | |
<div id="dataset-subsets" class="hidden mt-4"> | |
<h4 class="font-medium text-gray-700 mb-2">Select Subset:</h4> | |
<select id="subset-select" class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500"> | |
<!-- Subsets will be populated by JavaScript --> | |
</select> | |
</div> | |
</div> | |
</div> | |
<!-- Metrics Tab --> | |
<div id="metrics-tab" class="tab-content hidden"> | |
<div class="mb-6"> | |
<h3 class="text-lg font-semibold mb-4">Select Evaluation Metrics</h3> | |
<div id="metrics-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4"> | |
<!-- Metrics will be populated by JavaScript --> | |
</div> | |
<div id="selected-metrics" class="mt-4"> | |
<h4 class="font-medium text-gray-700 mb-2">Selected Metrics:</h4> | |
<div id="selected-metrics-list" class="flex flex-wrap gap-2"> | |
<!-- Selected metrics will appear here --> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Configuration Tab --> | |
<div id="config-tab" class="tab-content hidden"> | |
<div class="mb-6"> | |
<h3 class="text-lg font-semibold mb-4">Evaluation Configuration</h3> | |
<div class="grid md:grid-cols-2 gap-6"> | |
<div class="space-y-4"> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Number of Samples</label> | |
<input type="range" id="num-samples" min="5" max="100" value="10" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>5</span> | |
<span id="num-samples-value">10</span> | |
<span>100</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Temperature</label> | |
<input type="range" id="temperature" min="0" max="2" step="0.1" value="0.7" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>0.0</span> | |
<span id="temperature-value">0.7</span> | |
<span>2.0</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Max Tokens</label> | |
<input type="range" id="max-tokens" min="50" max="500" value="150" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>50</span> | |
<span id="max-tokens-value">150</span> | |
<span>500</span> | |
</div> | |
</div> | |
</div> | |
<div class="space-y-4"> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Top P</label> | |
<input type="range" id="top-p" min="0.1" max="1" step="0.1" value="1.0" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>0.1</span> | |
<span id="top-p-value">1.0</span> | |
<span>1.0</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Frequency Penalty</label> | |
<input type="range" id="frequency-penalty" min="0" max="2" step="0.1" value="0.0" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>0.0</span> | |
<span id="frequency-penalty-value">0.0</span> | |
<span>2.0</span> | |
</div> | |
</div> | |
<div> | |
<label class="block text-sm font-medium text-gray-700 mb-2">Presence Penalty</label> | |
<input type="range" id="presence-penalty" min="0" max="2" step="0.1" value="0.0" | |
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer"> | |
<div class="flex justify-between text-xs text-gray-500 mt-1"> | |
<span>0.0</span> | |
<span id="presence-penalty-value">0.0</span> | |
<span>2.0</span> | |
</div> | |
</div> | |
</div> | |
</div> | |
<div class="mt-6 p-4 bg-blue-50 rounded-lg"> | |
<h4 class="font-medium text-blue-800 mb-2">💡 Configuration Tips</h4> | |
<ul class="text-sm text-blue-700 space-y-1"> | |
<li>• Lower temperature (0.0-0.3) for factual tasks, higher (0.7-1.0) for creative tasks</li> | |
<li>• Start with 10-20 samples for quick testing, use 50+ for reliable results</li> | |
<li>• Max tokens should match expected response length</li> | |
</ul> | |
</div> | |
</div> | |
</div> | |
<!-- Action Buttons --> | |
<div class="flex items-center justify-between pt-6 border-t border-gray-200"> | |
<div class="flex space-x-4"> | |
<button id="prev-tab" class="px-6 py-2 border border-gray-300 rounded-lg hover:bg-gray-50 transition-colors" onclick="previousTab()"> | |
Previous | |
</button> | |
<button id="next-tab" class="px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors" onclick="nextTab()"> | |
Next | |
</button> | |
</div> | |
<button id="start-evaluation" class="hidden px-8 py-3 gradient-bg text-white rounded-lg font-semibold hover:opacity-90 transition-opacity" onclick="startEvaluation()"> | |
🚀 Start Evaluation | |
</button> | |
</div> | |
</div> | |
<!-- Evaluation Progress --> | |
<div id="evaluation-section" class="hidden"> | |
<div class="bg-white rounded-xl shadow-lg p-8 mb-8"> | |
<div class="flex items-center justify-between mb-6"> | |
<h2 class="text-2xl font-bold text-gray-800">📊 Evaluation Progress</h2> | |
<div id="evaluation-status" class="px-4 py-2 bg-blue-100 text-blue-800 rounded-lg font-medium"> | |
Initializing... | |
</div> | |
</div> | |
<div class="mb-6"> | |
<div class="flex justify-between text-sm text-gray-600 mb-2"> | |
<span>Progress</span> | |
<span id="progress-percentage">0%</span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-3"> | |
<div id="progress-bar" class="progress-bar bg-gradient-to-r from-blue-500 to-purple-600 h-3 rounded-full" style="width: 0%"></div> | |
</div> | |
</div> | |
<div class="grid md:grid-cols-2 gap-6"> | |
<div> | |
<h3 class="text-lg font-semibold mb-4">📝 Live Evaluation Logs</h3> | |
<div id="evaluation-logs" class="bg-gray-900 text-green-400 p-4 rounded-lg h-64 overflow-y-auto font-mono text-sm"> | |
<!-- Logs will appear here --> | |
</div> | |
</div> | |
<div> | |
<h3 class="text-lg font-semibold mb-4">🔍 Request/Response Details</h3> | |
<div id="request-response-logs" class="bg-gray-50 p-4 rounded-lg h-64 overflow-y-auto text-sm"> | |
<!-- Request/response details will appear here --> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Results Section --> | |
<div id="results-section" class="hidden"> | |
<div class="bg-white rounded-xl shadow-lg p-8"> | |
<div class="flex items-center justify-between mb-6"> | |
<h2 class="text-2xl font-bold text-gray-800">🎉 Evaluation Results</h2> | |
<div class="flex space-x-2"> | |
<button onclick="exportResults('json')" class="px-4 py-2 bg-green-600 text-white rounded-lg hover:bg-green-700 transition-colors"> | |
Export JSON | |
</button> | |
<button onclick="exportResults('csv')" class="px-4 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"> | |
Export CSV | |
</button> | |
</div> | |
</div> | |
<div id="results-content"> | |
<!-- Results will be populated by JavaScript --> | |
</div> | |
</div> | |
</div> | |
</div> | |
<script> | |
// Global state | |
let selectedModels = []; | |
let selectedDataset = null; | |
let selectedDatasetSubset = null; | |
let selectedMetrics = []; | |
let currentTab = 'models'; | |
let evaluationResults = null; | |
let websocket = null; | |
// Data from backend | |
const MODELS = {json.dumps(SUPPORTED_MODELS)}; | |
const DATASETS = {json.dumps(SUPPORTED_DATASETS)}; | |
const METRICS = {json.dumps(SUPPORTED_METRICS)}; | |
// Initialize the application | |
document.addEventListener('DOMContentLoaded', function() {{ | |
populateModels(); | |
populateDatasets(); | |
populateMetrics(); | |
setupSliders(); | |
setupSearch(); | |
}}); | |
function populateModels() {{ | |
const grid = document.getElementById('models-grid'); | |
grid.innerHTML = ''; | |
Object.entries(MODELS).forEach(([provider, models]) => {{ | |
Object.entries(models).forEach(([modelId, model]) => {{ | |
const card = document.createElement('div'); | |
card.className = 'model-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4'; | |
card.dataset.provider = provider; | |
card.dataset.modelId = modelId; | |
card.innerHTML = ` | |
<div class="flex items-start justify-between mb-2"> | |
<h4 class="font-semibold text-gray-800">${{model.name}}</h4> | |
<span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{model.provider}}</span> | |
</div> | |
<p class="text-sm text-gray-600 mb-3">${{model.description}}</p> | |
<div class="space-y-1 text-xs text-gray-500"> | |
<div>Context: ${{model.context_length}} tokens</div> | |
<div>Cost: $$${{model.cost_per_1k_tokens}}/1K tokens</div> | |
<div class="flex flex-wrap gap-1 mt-2"> | |
${{model.capabilities.map(cap => `<span class="bg-blue-100 text-blue-700 px-2 py-1 rounded">${{cap}}</span>`).join('')}} | |
</div> | |
</div> | |
`; | |
card.addEventListener('click', () => toggleModel(modelId, model)); | |
grid.appendChild(card); | |
}}); | |
}}); | |
}} | |
function populateDatasets() {{ | |
const grid = document.getElementById('datasets-grid'); | |
grid.innerHTML = ''; | |
Object.entries(DATASETS).forEach(([datasetId, dataset]) => {{ | |
const card = document.createElement('div'); | |
card.className = 'dataset-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4'; | |
card.dataset.datasetId = datasetId; | |
card.innerHTML = ` | |
<div class="flex items-start justify-between mb-2"> | |
<h4 class="font-semibold text-gray-800">${{dataset.name}}</h4> | |
<span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{dataset.type}}</span> | |
</div> | |
<p class="text-sm text-gray-600 mb-3">${{dataset.description}}</p> | |
<div class="text-xs text-gray-500"> | |
<div>Samples: ${{dataset.sample_count}}</div> | |
<div>Subsets: ${{Array.isArray(dataset.subsets) ? dataset.subsets.length : 'N/A'}}</div> | |
</div> | |
`; | |
card.addEventListener('click', () => selectDataset(datasetId, dataset)); | |
grid.appendChild(card); | |
}}); | |
}} | |
function populateMetrics() {{ | |
const grid = document.getElementById('metrics-grid'); | |
grid.innerHTML = ''; | |
Object.entries(METRICS).forEach(([metricId, metric]) => {{ | |
const card = document.createElement('div'); | |
card.className = 'metric-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4'; | |
card.dataset.metricId = metricId; | |
card.innerHTML = ` | |
<div class="flex items-start justify-between mb-2"> | |
<h4 class="font-semibold text-gray-800">${{metric.name}}</h4> | |
<span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{metric.category}}</span> | |
</div> | |
<p class="text-sm text-gray-600 mb-3">${{metric.description}}</p> | |
<div class="text-xs text-gray-500"> | |
Best for: ${{metric.best_for.join(', ')}} | |
</div> | |
`; | |
card.addEventListener('click', () => toggleMetric(metricId, metric)); | |
grid.appendChild(card); | |
}}); | |
}} | |
function setupSliders() {{ | |
const sliders = ['num-samples', 'temperature', 'max-tokens', 'top-p', 'frequency-penalty', 'presence-penalty']; | |
sliders.forEach(sliderId => {{ | |
const slider = document.getElementById(sliderId); | |
const valueDisplay = document.getElementById(sliderId + '-value'); | |
slider.addEventListener('input', function() {{ | |
valueDisplay.textContent = this.value; | |
}}); | |
}}); | |
}} | |
function setupSearch() {{ | |
const searchInput = document.getElementById('model-search'); | |
const providerFilter = document.getElementById('provider-filter'); | |
function filterModels() {{ | |
const searchTerm = searchInput.value.toLowerCase(); | |
const selectedProvider = providerFilter.value; | |
const modelCards = document.querySelectorAll('.model-card'); | |
modelCards.forEach(card => {{ | |
const modelName = card.querySelector('h4').textContent.toLowerCase(); | |
const provider = card.dataset.provider; | |
const matchesSearch = modelName.includes(searchTerm); | |
const matchesProvider = !selectedProvider || provider === selectedProvider; | |
card.style.display = matchesSearch && matchesProvider ? 'block' : 'none'; | |
}}); | |
}} | |
searchInput.addEventListener('input', filterModels); | |
providerFilter.addEventListener('change', filterModels); | |
}} | |
function toggleModel(modelId, model) {{ | |
const index = selectedModels.findIndex(m => m.id === modelId); | |
if (index > -1) {{ | |
selectedModels.splice(index, 1); | |
}} else if (selectedModels.length < 5) {{ | |
selectedModels.push({{id: modelId, ...model}}); | |
}} else {{ | |
alert('Maximum 5 models can be selected'); | |
return; | |
}} | |
updateModelSelection(); | |
}} | |
function updateModelSelection() {{ | |
// Update visual selection | |
document.querySelectorAll('.model-card').forEach(card => {{ | |
const modelId = card.dataset.modelId; | |
if (selectedModels.some(m => m.id === modelId)) {{ | |
card.classList.add('selected'); | |
}} else {{ | |
card.classList.remove('selected'); | |
}} | |
}}); | |
// Update selected models list | |
const list = document.getElementById('selected-models-list'); | |
list.innerHTML = selectedModels.map(model => | |
`<span class="bg-green-100 text-green-800 px-3 py-1 rounded-full text-sm">${{model.name}}</span>` | |
).join(''); | |
}} | |
function selectDataset(datasetId, dataset) {{ | |
selectedDataset = datasetId; | |
// Update visual selection | |
document.querySelectorAll('.dataset-card').forEach(card => {{ | |
if (card.dataset.datasetId === datasetId) {{ | |
card.classList.add('selected'); | |
}} else {{ | |
card.classList.remove('selected'); | |
}} | |
}}); | |
// Show subsets if available | |
if (dataset.subsets && dataset.subsets.length > 0) {{ | |
const subsetsDiv = document.getElementById('dataset-subsets'); | |
const select = document.getElementById('subset-select'); | |
select.innerHTML = '<option value="">All subsets</option>' + | |
dataset.subsets.map(subset => `<option value="${{subset}}">${{subset}}</option>`).join(''); | |
subsetsDiv.classList.remove('hidden'); | |
select.addEventListener('change', function() {{ | |
selectedDatasetSubset = this.value || null; | |
}}); | |
}} else {{ | |
document.getElementById('dataset-subsets').classList.add('hidden'); | |
selectedDatasetSubset = null; | |
}} | |
}} | |
function toggleMetric(metricId, metric) {{ | |
const index = selectedMetrics.findIndex(m => m.id === metricId); | |
if (index > -1) {{ | |
selectedMetrics.splice(index, 1); | |
}} else {{ | |
selectedMetrics.push({{id: metricId, ...metric}}); | |
}} | |
updateMetricSelection(); | |
}} | |
function updateMetricSelection() {{ | |
// Update visual selection | |
document.querySelectorAll('.metric-card').forEach(card => {{ | |
const metricId = card.dataset.metricId; | |
if (selectedMetrics.some(m => m.id === metricId)) {{ | |
card.classList.add('selected'); | |
}} else {{ | |
card.classList.remove('selected'); | |
}} | |
}}); | |
// Update selected metrics list | |
const list = document.getElementById('selected-metrics-list'); | |
list.innerHTML = selectedMetrics.map(metric => | |
`<span class="bg-purple-100 text-purple-800 px-3 py-1 rounded-full text-sm">${{metric.name}}</span>` | |
).join(''); | |
}} | |
function switchTab(tabName) {{ | |
currentTab = tabName; | |
// Hide all tabs | |
document.querySelectorAll('.tab-content').forEach(tab => {{ | |
tab.classList.add('hidden'); | |
}}); | |
// Show selected tab | |
document.getElementById(tabName + '-tab').classList.remove('hidden'); | |
// Update tab buttons | |
document.querySelectorAll('.tab-button').forEach(btn => {{ | |
btn.classList.remove('active'); | |
}}); | |
event.target.classList.add('active'); | |
updateNavigationButtons(); | |
}} | |
function nextTab() {{ | |
const tabs = ['models', 'datasets', 'metrics', 'config']; | |
const currentIndex = tabs.indexOf(currentTab); | |
if (currentIndex < tabs.length - 1) {{ | |
const nextTab = tabs[currentIndex + 1]; | |
document.querySelector(`[onclick="switchTab('${{nextTab}}')"]`).click(); | |
}} | |
}} | |
function previousTab() {{ | |
const tabs = ['models', 'datasets', 'metrics', 'config']; | |
const currentIndex = tabs.indexOf(currentTab); | |
if (currentIndex > 0) {{ | |
const prevTab = tabs[currentIndex - 1]; | |
document.querySelector(`[onclick="switchTab('${{prevTab}}')"]`).click(); | |
}} | |
}} | |
function updateNavigationButtons() {{ | |
const prevBtn = document.getElementById('prev-tab'); | |
const nextBtn = document.getElementById('next-tab'); | |
const startBtn = document.getElementById('start-evaluation'); | |
prevBtn.style.display = currentTab === 'models' ? 'none' : 'block'; | |
if (currentTab === 'config') {{ | |
nextBtn.classList.add('hidden'); | |
startBtn.classList.remove('hidden'); | |
}} else {{ | |
nextBtn.classList.remove('hidden'); | |
startBtn.classList.add('hidden'); | |
}} | |
}} | |
function startEvaluation() {{ | |
// Validate selections | |
if (selectedModels.length === 0) {{ | |
alert('Please select at least one model'); | |
return; | |
}} | |
if (!selectedDataset) {{ | |
alert('Please select a dataset'); | |
return; | |
}} | |
if (selectedMetrics.length === 0) {{ | |
alert('Please select at least one metric'); | |
return; | |
}} | |
// Prepare evaluation request | |
const request = {{ | |
models: selectedModels.map(m => m.id), | |
dataset: selectedDataset, | |
dataset_subset: selectedDatasetSubset, | |
metrics: selectedMetrics.map(m => m.id), | |
num_samples: parseInt(document.getElementById('num-samples').value), | |
temperature: parseFloat(document.getElementById('temperature').value), | |
max_tokens: parseInt(document.getElementById('max-tokens').value), | |
top_p: parseFloat(document.getElementById('top-p').value), | |
frequency_penalty: parseFloat(document.getElementById('frequency-penalty').value), | |
presence_penalty: parseFloat(document.getElementById('presence-penalty').value) | |
}}; | |
// Show evaluation section | |
document.getElementById('evaluation-section').classList.remove('hidden'); | |
document.getElementById('results-section').classList.add('hidden'); | |
// Scroll to evaluation section | |
document.getElementById('evaluation-section').scrollIntoView({{ behavior: 'smooth' }}); | |
// Start evaluation | |
fetch('/api/evaluate', {{ | |
method: 'POST', | |
headers: {{ | |
'Content-Type': 'application/json' | |
}}, | |
body: JSON.stringify(request) | |
}}) | |
.then(response => response.json()) | |
.then(data => {{ | |
if (data.status === 'started') {{ | |
connectWebSocket(data.evaluation_id); | |
}} else {{ | |
alert('Failed to start evaluation: ' + data.message); | |
}} | |
}}) | |
.catch(error => {{ | |
console.error('Error:', error); | |
alert('Failed to start evaluation'); | |
}}); | |
}} | |
function connectWebSocket(evaluationId) {{ | |
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; | |
const wsUrl = `${{protocol}}//${{window.location.host}}/ws/${{evaluationId}}`; | |
websocket = new WebSocket(wsUrl); | |
websocket.onmessage = function(event) {{ | |
const data = JSON.parse(event.data); | |
handleWebSocketMessage(data); | |
}}; | |
websocket.onclose = function() {{ | |
console.log('WebSocket connection closed'); | |
}}; | |
websocket.onerror = function(error) {{ | |
console.error('WebSocket error:', error); | |
}}; | |
}} | |
function handleWebSocketMessage(data) {{ | |
const logsContainer = document.getElementById('evaluation-logs'); | |
const requestResponseContainer = document.getElementById('request-response-logs'); | |
switch (data.type) {{ | |
case 'evaluation_start': | |
case 'evaluation_progress': | |
case 'model_start': | |
case 'model_complete': | |
// Update progress | |
if (data.progress !== undefined) {{ | |
document.getElementById('progress-bar').style.width = data.progress + '%'; | |
document.getElementById('progress-percentage').textContent = data.progress + '%'; | |
}} | |
// Add log entry | |
const logEntry = document.createElement('div'); | |
logEntry.className = 'log-entry mb-1'; | |
logEntry.textContent = data.message; | |
logsContainer.appendChild(logEntry); | |
logsContainer.scrollTop = logsContainer.scrollHeight; | |
break; | |
case 'request_response': | |
// Add request/response details | |
const reqResEntry = document.createElement('div'); | |
reqResEntry.className = 'mb-4 p-3 bg-white rounded border'; | |
reqResEntry.innerHTML = ` | |
<div class="font-medium text-gray-800 mb-2">${{data.model}} - Sample ${{data.sample_index}}</div> | |
<div class="text-sm space-y-2"> | |
<div> | |
<span class="font-medium text-blue-600">Request:</span> | |
<pre class="text-xs bg-gray-100 p-2 rounded mt-1">${{JSON.stringify(data.request, null, 2)}}</pre> | |
</div> | |
<div> | |
<span class="font-medium text-green-600">Response:</span> | |
<pre class="text-xs bg-gray-100 p-2 rounded mt-1">${{JSON.stringify(data.response, null, 2)}}</pre> | |
</div> | |
</div> | |
`; | |
requestResponseContainer.appendChild(reqResEntry); | |
requestResponseContainer.scrollTop = requestResponseContainer.scrollHeight; | |
break; | |
case 'evaluation_complete': | |
// Update progress to 100% | |
document.getElementById('progress-bar').style.width = '100%'; | |
document.getElementById('progress-percentage').textContent = '100%'; | |
document.getElementById('evaluation-status').textContent = 'Completed'; | |
document.getElementById('evaluation-status').className = 'px-4 py-2 bg-green-100 text-green-800 rounded-lg font-medium'; | |
// Show results | |
evaluationResults = data; | |
displayResults(data.results, data.summary); | |
break; | |
case 'evaluation_error': | |
document.getElementById('evaluation-status').textContent = 'Failed'; | |
document.getElementById('evaluation-status').className = 'px-4 py-2 bg-red-100 text-red-800 rounded-lg font-medium'; | |
const errorEntry = document.createElement('div'); | |
errorEntry.className = 'log-entry mb-1 text-red-400'; | |
errorEntry.textContent = data.message; | |
logsContainer.appendChild(errorEntry); | |
break; | |
}} | |
}} | |
function displayResults(results, summary) {{ | |
document.getElementById('results-section').classList.remove('hidden'); | |
document.getElementById('results-section').scrollIntoView({{ behavior: 'smooth' }}); | |
const container = document.getElementById('results-content'); | |
// Summary section | |
const summaryHtml = ` | |
<div class="grid md:grid-cols-4 gap-4 mb-8"> | |
<div class="bg-blue-50 p-4 rounded-lg"> | |
<div class="text-2xl font-bold text-blue-600">${{summary.models_evaluated}}</div> | |
<div class="text-sm text-blue-800">Models Evaluated</div> | |
</div> | |
<div class="bg-green-50 p-4 rounded-lg"> | |
<div class="text-2xl font-bold text-green-600">${{summary.total_samples}}</div> | |
<div class="text-sm text-green-800">Total Samples</div> | |
</div> | |
<div class="bg-purple-50 p-4 rounded-lg"> | |
<div class="text-2xl font-bold text-purple-600">$${{summary.total_cost_usd}}</div> | |
<div class="text-sm text-purple-800">Total Cost</div> | |
</div> | |
<div class="bg-orange-50 p-4 rounded-lg"> | |
<div class="text-2xl font-bold text-orange-600">${{summary.avg_latency_ms}}ms</div> | |
<div class="text-sm text-orange-800">Avg Latency</div> | |
</div> | |
</div> | |
`; | |
// Results table | |
const modelsArray = Object.entries(results); | |
const metricsArray = Object.keys(modelsArray[0][1].metrics); | |
const tableHtml = ` | |
<div class="overflow-x-auto"> | |
<table class="w-full border-collapse border border-gray-300"> | |
<thead> | |
<tr class="bg-gray-50"> | |
<th class="border border-gray-300 px-4 py-2 text-left">Model</th> | |
${{metricsArray.map(metric => `<th class="border border-gray-300 px-4 py-2 text-center">${{METRICS[metric].name}}</th>`).join('')}} | |
<th class="border border-gray-300 px-4 py-2 text-center">Cost</th> | |
<th class="border border-gray-300 px-4 py-2 text-center">Latency</th> | |
</tr> | |
</thead> | |
<tbody> | |
${{modelsArray.map(([modelId, result]) => ` | |
<tr class="hover:bg-gray-50"> | |
<td class="border border-gray-300 px-4 py-2 font-medium">${{result.model_info.name}}</td> | |
${{metricsArray.map(metric => ` | |
<td class="border border-gray-300 px-4 py-2 text-center"> | |
<span class="font-mono">${{result.metrics[metric].score}}</span> | |
</td> | |
`).join('')}} | |
<td class="border border-gray-300 px-4 py-2 text-center">$${{result.total_cost.toFixed(4)}}</td> | |
<td class="border border-gray-300 px-4 py-2 text-center">${{result.avg_latency_ms}}ms</td> | |
</tr> | |
`).join('')}} | |
</tbody> | |
</table> | |
</div> | |
`; | |
container.innerHTML = summaryHtml + tableHtml; | |
}} | |
function exportResults(format) {{ | |
if (!evaluationResults) {{ | |
alert('No results to export'); | |
return; | |
}} | |
let content, filename, mimeType; | |
if (format === 'json') {{ | |
content = JSON.stringify(evaluationResults, null, 2); | |
filename = 'novaeval_results.json'; | |
mimeType = 'application/json'; | |
}} else if (format === 'csv') {{ | |
// Convert to CSV | |
const results = evaluationResults.results; | |
const headers = ['Model', 'Provider']; | |
const metricsArray = Object.keys(Object.values(results)[0].metrics); | |
headers.push(...metricsArray, 'Total Cost', 'Avg Latency (ms)'); | |
const rows = [headers]; | |
Object.entries(results).forEach(([modelId, result]) => {{ | |
const row = [ | |
result.model_info.name, | |
result.model_info.provider, | |
...metricsArray.map(metric => result.metrics[metric].score), | |
result.total_cost.toFixed(4), | |
result.avg_latency_ms | |
]; | |
rows.push(row); | |
}}); | |
content = rows.map(row => row.join(',')).join('\\n'); | |
filename = 'novaeval_results.csv'; | |
mimeType = 'text/csv'; | |
}} | |
const blob = new Blob([content], {{ type: mimeType }}); | |
const url = URL.createObjectURL(blob); | |
const a = document.createElement('a'); | |
a.href = url; | |
a.download = filename; | |
document.body.appendChild(a); | |
a.click(); | |
document.body.removeChild(a); | |
URL.revokeObjectURL(url); | |
}} | |
</script> | |
</body> | |
</html> | |
""") | |
# NOTE(review): no route decorator was present in the source; the frontend
# embeds this data directly in the page template, but an API path is implied
# by the docstring — "/api/models" assumed, confirm against clients.
@app.get("/api/models")
async def get_models():
    """Return the catalog of supported LLMs, grouped by provider."""
    return SUPPORTED_MODELS
# NOTE(review): route decorator was missing in the source; "/api/datasets"
# assumed by analogy with the other catalog endpoints — confirm path.
@app.get("/api/datasets")
async def get_datasets():
    """Return the catalog of supported evaluation datasets."""
    return SUPPORTED_DATASETS
# NOTE(review): route decorator was missing in the source; "/api/metrics"
# assumed by analogy with the other catalog endpoints — confirm path.
@app.get("/api/metrics")
async def get_metrics():
    """Return the catalog of supported evaluation metrics."""
    return SUPPORTED_METRICS
# Strong references to in-flight background tasks. asyncio.create_task()
# stores only a weak reference to its task, so a task whose handle is
# discarded can be garbage-collected before it finishes (per asyncio docs).
_evaluation_tasks: set = set()


# Path grounded in the frontend, which calls fetch('/api/evaluate').
# NOTE(review): the decorator itself was missing in the source — confirm.
@app.post("/api/evaluate", response_model=EvaluationResponse)
async def start_evaluation(request: EvaluationRequest):
    """Register a new evaluation and launch it in the background.

    Returns immediately with the evaluation id; progress is streamed to the
    client over the WebSocket endpoint and the run itself is performed by
    ``simulate_novaeval_evaluation``.
    """
    evaluation_id = str(uuid.uuid4())
    # Record the evaluation up front so status queries can find it even
    # before the background task has made any progress.
    active_evaluations[evaluation_id] = {
        "id": evaluation_id,
        # NOTE(review): .dict() is deprecated under pydantic v2 in favour of
        # .model_dump() — confirm which pydantic major version is pinned.
        "request": request.dict(),
        "status": "starting",
        "created_at": datetime.now().isoformat(),
    }
    # Launch without blocking the request, keeping a strong reference so the
    # task cannot be garbage-collected mid-run; drop the reference on completion.
    task = asyncio.create_task(simulate_novaeval_evaluation(evaluation_id, request))
    _evaluation_tasks.add(task)
    task.add_done_callback(_evaluation_tasks.discard)
    logger.info(f"Started evaluation {evaluation_id} with {len(request.models)} models")
    return EvaluationResponse(
        evaluation_id=evaluation_id,
        status="started",
        message="Evaluation started successfully",
    )
# NOTE(review): route decorator was missing in the source; path assumed —
# confirm how clients poll evaluation status.
@app.get("/api/evaluations/{evaluation_id}")
async def get_evaluation(evaluation_id: str):
    """Return the stored status/results record for one evaluation.

    Raises:
        HTTPException: 404 if the evaluation id is unknown.
    """
    # Single lookup instead of `in` check followed by indexing.
    evaluation = active_evaluations.get(evaluation_id)
    if evaluation is None:
        raise HTTPException(status_code=404, detail="Evaluation not found")
    return evaluation
# Path grounded in the frontend's `${protocol}//${host}/ws/${evaluationId}`.
# NOTE(review): the decorator itself was missing in the source — confirm.
@app.websocket("/ws/{evaluation_id}")
async def websocket_endpoint(websocket: WebSocket, evaluation_id: str):
    """WebSocket endpoint for real-time evaluation updates.

    The connection is appended to the module-level ``websocket_connections``
    list so background evaluation code can broadcast progress to it. Incoming
    client messages are read and discarded purely so that a client disconnect
    is detected: Starlette raises WebSocketDisconnect from ``receive_*`` calls,
    so the previous bare ``sleep`` loop would never notice the client leaving.
    """
    await websocket.accept()
    websocket_connections.append(websocket)
    try:
        while True:
            # Block on the client; raises WebSocketDisconnect when it goes away.
            await websocket.receive_text()
    except WebSocketDisconnect:
        pass
    finally:
        # Deregister on any exit path (disconnect or unexpected error) so the
        # broadcast list never accumulates dead connections.
        if websocket in websocket_connections:
            websocket_connections.remove(websocket)
# NOTE(review): route decorator was missing in the source; "/health" is the
# conventional path — confirm against the deployment's health probe config.
@app.get("/health")
async def health_check():
    """Liveness probe: report service version and evaluation count."""
    return {
        "status": "healthy",
        # Naive local time, matching the timestamps stored on evaluations.
        "timestamp": datetime.now().isoformat(),
        "version": "1.0.0",
        "active_evaluations": len(active_evaluations),
    }
if __name__ == "__main__":
    # Allow the hosting environment to override the default Spaces port (7860).
    port = int(os.environ.get("PORT", 7860))
    logger.info("Starting Comprehensive NovaEval Space on port %s", port)
    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)