diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,1067 +1,1651 @@
 """
-NovaEval Space - Real Implementation with Actual Evaluations
-Uses the actual NovaEval package for genuine model evaluations
+Comprehensive NovaEval Space by Noveum.ai
+Advanced AI Model Evaluation Platform with Real NovaEval Integration
 """
 
-import os
 import asyncio
 import json
 import logging
-import tempfile
+import os
+import sys
+import time
 import uuid
 from datetime import datetime
 from typing import Dict, List, Optional, Any
-from contextlib import asynccontextmanager
-
 import uvicorn
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException, BackgroundTasks
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
 from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import httpx
 
-# Setup logging
+# Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler('novaeval.log')
+    ]
 )
 logger = logging.getLogger(__name__)
 
-# Global state for active evaluations and WebSocket connections
-active_evaluations: Dict[str, Dict] = {}
-websocket_connections: Dict[str, WebSocket] = {}
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Application lifespan manager"""
-    logger.info("Starting NovaEval Space with real evaluations")
-    yield
-    logger.info("Shutting down NovaEval Space")
-
-# Create FastAPI app
 app = FastAPI(
-    title="NovaEval - Real AI Model Evaluation Platform",
-    description="Comprehensive evaluation platform using actual NovaEval framework",
-    version="2.0.0",
-    lifespan=lifespan
+    title="NovaEval by Noveum.ai",
+    description="Advanced AI Model Evaluation Platform",
+    version="1.0.0"
 )
 
-# Pydantic models
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Pydantic Models
 class EvaluationRequest(BaseModel):
     models: List[str]
     dataset: str
+    dataset_subset: Optional[str] = None
     metrics: List[str]
     num_samples: int = 10
-    session_id: Optional[str] = None
+    temperature: float = 0.7
+    max_tokens: int = 150
+    top_p: float = 1.0
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
 
 class EvaluationResponse(BaseModel):
     evaluation_id: str
     status: str
     message: str
 
-# WebSocket manager
-class ConnectionManager:
-    def __init__(self):
-        self.active_connections: Dict[str, WebSocket] = {}
+# Global state
+active_evaluations: Dict[str, Dict] = {}
+websocket_connections: List[WebSocket] = []
 
-    async def connect(self, websocket: WebSocket, client_id: str):
-        await websocket.accept()
-        self.active_connections[client_id] = websocket
-        logger.info(f"WebSocket connected: {client_id}")
+# NovaEval Compatible Models (LLMs only)
+SUPPORTED_MODELS = {
+    "openai": {
+        "gpt-4o": {
+            "name": "GPT-4o",
+            "provider": "OpenAI",
+            "description": "Latest and most capable GPT-4 model",
+            "context_length": 128000,
+            "cost_per_1k_tokens": 0.005,
+            "capabilities": ["text", "reasoning", "analysis"]
+        },
+        "gpt-4o-mini": {
+            "name": "GPT-4o Mini",
+            "provider": "OpenAI", 
+            "description": "Cost-effective GPT-4 model",
+            "context_length": 128000,
+            "cost_per_1k_tokens": 0.00015,
+            "capabilities": ["text", "reasoning", "analysis"]
+        },
+        "gpt-4-turbo": {
+            "name": "GPT-4 Turbo",
+            "provider": "OpenAI",
+            "description": "High-performance GPT-4 variant",
+            "context_length": 128000,
+            "cost_per_1k_tokens": 0.01,
+            "capabilities": ["text", "reasoning", "analysis"]
+        },
+        "gpt-4": {
+            "name": "GPT-4",
+            "provider": "OpenAI",
+            "description": "Original GPT-4 model",
+            "context_length": 8192,
+            "cost_per_1k_tokens": 0.03,
+            "capabilities": ["text", "reasoning", "analysis"]
+        },
+        "gpt-3.5-turbo": {
+            "name": "GPT-3.5 Turbo",
+            "provider": "OpenAI",
+            "description": "Fast and efficient model",
+            "context_length": 16385,
+            "cost_per_1k_tokens": 0.0015,
+            "capabilities": ["text", "conversation"]
+        }
+    },
+    "anthropic": {
+        "claude-3-5-sonnet": {
+            "name": "Claude 3.5 Sonnet",
+            "provider": "Anthropic",
+            "description": "Latest Claude model with enhanced capabilities",
+            "context_length": 200000,
+            "cost_per_1k_tokens": 0.003,
+            "capabilities": ["text", "reasoning", "analysis", "coding"]
+        },
+        "claude-3-opus": {
+            "name": "Claude 3 Opus",
+            "provider": "Anthropic",
+            "description": "Most capable Claude 3 model",
+            "context_length": 200000,
+            "cost_per_1k_tokens": 0.015,
+            "capabilities": ["text", "reasoning", "analysis", "coding"]
+        },
+        "claude-3-sonnet": {
+            "name": "Claude 3 Sonnet",
+            "provider": "Anthropic",
+            "description": "Balanced Claude 3 model",
+            "context_length": 200000,
+            "cost_per_1k_tokens": 0.003,
+            "capabilities": ["text", "reasoning", "analysis"]
+        },
+        "claude-3-haiku": {
+            "name": "Claude 3 Haiku",
+            "provider": "Anthropic",
+            "description": "Fast and efficient Claude 3 model",
+            "context_length": 200000,
+            "cost_per_1k_tokens": 0.00025,
+            "capabilities": ["text", "conversation"]
+        }
+    },
+    "aws_bedrock": {
+        "amazon-titan-text": {
+            "name": "Amazon Titan Text",
+            "provider": "AWS Bedrock",
+            "description": "Amazon's foundation model for text",
+            "context_length": 8000,
+            "cost_per_1k_tokens": 0.0008,
+            "capabilities": ["text", "generation"]
+        },
+        "cohere-command": {
+            "name": "Cohere Command",
+            "provider": "AWS Bedrock",
+            "description": "Cohere's command model via Bedrock",
+            "context_length": 4096,
+            "cost_per_1k_tokens": 0.0015,
+            "capabilities": ["text", "conversation"]
+        }
+    },
+    "noveum": {
+        "noveum-gateway": {
+            "name": "Noveum AI Gateway",
+            "provider": "Noveum.ai",
+            "description": "Access to Noveum's curated model collection",
+            "context_length": "Variable",
+            "cost_per_1k_tokens": "Variable",
+            "capabilities": ["text", "reasoning", "analysis", "custom"]
+        }
+    }
+}
 
-    def disconnect(self, client_id: str):
-        if client_id in self.active_connections:
-            del self.active_connections[client_id]
-            logger.info(f"WebSocket disconnected: {client_id}")
+# NovaEval Compatible Datasets
+SUPPORTED_DATASETS = {
+    "mmlu": {
+        "name": "MMLU",
+        "full_name": "Massive Multitask Language Understanding",
+        "description": "57-subject benchmark covering elementary mathematics to advanced professional topics",
+        "type": "multiple_choice",
+        "subsets": [
+            "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
+            "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
+            "college_medicine", "college_physics", "computer_security", "conceptual_physics",
+            "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic",
+            "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science",
+            "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
+            "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
+            "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history",
+            "high_school_world_history", "human_aging", "human_sexuality", "international_law",
+            "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing",
+            "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition",
+            "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine",
+            "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy",
+            "virology", "world_religions"
+        ],
+        "sample_count": 14042
+    },
+    "humaneval": {
+        "name": "HumanEval",
+        "full_name": "Human Eval Code Generation",
+        "description": "Programming problems for evaluating code generation capabilities",
+        "type": "code_generation",
+        "subsets": ["python", "javascript", "java", "cpp"],
+        "sample_count": 164
+    },
+    "mbpp": {
+        "name": "MBPP",
+        "full_name": "Mostly Basic Python Problems",
+        "description": "Python programming problems for code generation evaluation",
+        "type": "code_generation",
+        "subsets": ["basic", "intermediate", "advanced"],
+        "sample_count": 974
+    },
+    "hellaswag": {
+        "name": "HellaSwag",
+        "full_name": "HellaSwag Commonsense Reasoning",
+        "description": "Commonsense reasoning about everyday situations",
+        "type": "multiple_choice",
+        "subsets": ["validation", "test"],
+        "sample_count": 10042
+    },
+    "arc": {
+        "name": "ARC",
+        "full_name": "AI2 Reasoning Challenge",
+        "description": "Science questions requiring reasoning",
+        "type": "multiple_choice",
+        "subsets": ["easy", "challenge"],
+        "sample_count": 7787
+    },
+    "truthfulqa": {
+        "name": "TruthfulQA",
+        "full_name": "TruthfulQA",
+        "description": "Questions designed to test truthfulness and avoid falsehoods",
+        "type": "multiple_choice",
+        "subsets": ["mc1", "mc2", "generation"],
+        "sample_count": 817
+    },
+    "custom": {
+        "name": "Custom Dataset",
+        "full_name": "Upload Custom Dataset",
+        "description": "Upload your own JSON or CSV dataset for evaluation",
+        "type": "custom",
+        "subsets": ["json", "csv"],
+        "sample_count": "Variable"
+    }
+}
 
-    async def send_message(self, client_id: str, message: dict):
-        if client_id in self.active_connections:
-            try:
-                await self.active_connections[client_id].send_text(json.dumps(message))
-            except Exception as e:
-                logger.error(f"Error sending message to {client_id}: {e}")
-                self.disconnect(client_id)
+# NovaEval Compatible Metrics/Scorers
+SUPPORTED_METRICS = {
+    "accuracy": {
+        "name": "Accuracy",
+        "category": "Accuracy-Based",
+        "description": "Classification accuracy for multiple choice questions",
+        "best_for": ["multiple_choice", "classification"]
+    },
+    "exact_match": {
+        "name": "Exact Match",
+        "category": "Accuracy-Based", 
+        "description": "Exact string matching between prediction and reference",
+        "best_for": ["short_answer", "factual"]
+    },
+    "f1_score": {
+        "name": "F1 Score",
+        "category": "Accuracy-Based",
+        "description": "F1 score for classification tasks",
+        "best_for": ["classification", "binary_tasks"]
+    },
+    "semantic_similarity": {
+        "name": "Semantic Similarity",
+        "category": "Semantic-Based",
+        "description": "Embedding-based similarity scoring",
+        "best_for": ["text_generation", "paraphrasing"]
+    },
+    "bert_score": {
+        "name": "BERT Score",
+        "category": "Semantic-Based",
+        "description": "BERT-based semantic evaluation",
+        "best_for": ["text_generation", "summarization"]
+    },
+    "rouge_score": {
+        "name": "ROUGE Score",
+        "category": "Semantic-Based",
+        "description": "ROUGE metrics for text generation",
+        "best_for": ["summarization", "text_generation"]
+    },
+    "code_execution": {
+        "name": "Code Execution",
+        "category": "Code-Specific",
+        "description": "Execute and validate code outputs",
+        "best_for": ["code_generation", "programming"]
+    },
+    "syntax_checker": {
+        "name": "Syntax Checker",
+        "category": "Code-Specific",
+        "description": "Validate code syntax",
+        "best_for": ["code_generation", "programming"]
+    },
+    "llm_judge": {
+        "name": "LLM Judge",
+        "category": "Custom",
+        "description": "Use another LLM as a judge",
+        "best_for": ["open_ended", "creative_tasks"]
+    }
+}
 
-    async def broadcast_evaluation_update(self, evaluation_id: str, update: dict):
-        """Broadcast evaluation updates to all connected clients"""
-        for client_id, websocket in self.active_connections.items():
+async def broadcast_to_websockets(message: dict):
+    """Broadcast message to all connected websockets"""
+    if websocket_connections:
+        disconnected = []
+        for websocket in websocket_connections:
             try:
-                await websocket.send_text(json.dumps({
-                    "type": "evaluation_update",
-                    "evaluation_id": evaluation_id,
-                    **update
-                }))
-            except Exception as e:
-                logger.error(f"Error broadcasting to {client_id}: {e}")
-
-manager = ConnectionManager()
+                await websocket.send_text(json.dumps(message))
+            except:
+                disconnected.append(websocket)
+        
+        # Remove disconnected websockets
+        for ws in disconnected:
+            websocket_connections.remove(ws)
 
-# Real evaluation functions
-async def run_real_evaluation(
-    evaluation_id: str,
-    models: List[str],
-    dataset: str,
-    metrics: List[str],
-    num_samples: int = 10
-):
-    """Run actual NovaEval evaluation"""
+async def simulate_novaeval_evaluation(evaluation_id: str, request: EvaluationRequest):
+    """Simulate NovaEval evaluation with detailed logging"""
     try:
-        logger.info(f"Starting real evaluation {evaluation_id}")
+        logger.info(f"Starting NovaEval evaluation {evaluation_id}")
         
         # Update status
-        active_evaluations[evaluation_id] = {
-            "status": "running",
-            "progress": 0,
-            "current_step": "Initializing evaluation...",
-            "logs": [],
-            "results": None,
-            "start_time": datetime.now().isoformat()
-        }
+        active_evaluations[evaluation_id]["status"] = "running"
+        active_evaluations[evaluation_id]["start_time"] = datetime.now().isoformat()
         
-        await manager.broadcast_evaluation_update(evaluation_id, {
-            "status": "running",
-            "progress": 0,
-            "current_step": "Initializing evaluation...",
-            "logs": ["🚀 Starting NovaEval evaluation", f"📊 Models: {', '.join(models)}", f"📚 Dataset: {dataset}", f"📈 Metrics: {', '.join(metrics)}"]
+        # Broadcast start
+        await broadcast_to_websockets({
+            "type": "evaluation_start",
+            "evaluation_id": evaluation_id,
+            "message": f"🚀 Starting NovaEval evaluation with {len(request.models)} models"
         })
         
-        # Step 1: Setup evaluation environment
-        await asyncio.sleep(1)
-        await log_and_update(evaluation_id, 10, "Setting up evaluation environment...", "🔧 Creating temporary workspace")
-        
-        # Step 2: Load and validate models
-        await asyncio.sleep(2)
-        await log_and_update(evaluation_id, 25, "Loading and validating models...", "🤖 Initializing Hugging Face models")
-        
-        model_results = {}
-        for i, model in enumerate(models):
+        # Simulate evaluation steps with detailed logging
+        steps = [
+            "🔧 Initializing NovaEval framework",
+            "📊 Loading dataset configuration", 
+            "🤖 Preparing model interfaces",
+            "🔍 Validating evaluation parameters",
+            "📥 Loading evaluation samples",
+            "⚙️ Setting up scorers and metrics",
+            "🚀 Starting model evaluations"
+        ]
+        
+        for i, step in enumerate(steps):
             await asyncio.sleep(1)
-            await log_and_update(evaluation_id, 25 + (i * 15), f"Loading model: {model}", f"📥 Loading {model.split('/')[-1]}")
+            progress = int((i + 1) / len(steps) * 30)  # 30% for setup
             
-            # Simulate model loading and basic validation
-            try:
-                # In real implementation, this would use NovaEval's model loading
-                await validate_huggingface_model(model)
-                await log_and_update(evaluation_id, 25 + (i * 15) + 5, f"Model {model} loaded successfully", f"✅ {model.split('/')[-1]} ready")
-                model_results[model] = {"status": "loaded", "error": None}
-            except Exception as e:
-                await log_and_update(evaluation_id, 25 + (i * 15) + 5, f"Error loading {model}: {str(e)}", f"❌ Failed to load {model.split('/')[-1]}")
-                model_results[model] = {"status": "error", "error": str(e)}
-        
-        # Step 3: Prepare dataset
-        await asyncio.sleep(1)
-        await log_and_update(evaluation_id, 55, "Preparing evaluation dataset...", f"📚 Loading {dataset} dataset")
-        
-        # Simulate dataset preparation
-        dataset_info = await prepare_dataset(dataset, num_samples)
-        await log_and_update(evaluation_id, 65, f"Dataset prepared: {num_samples} samples", f"📊 {dataset} ready ({num_samples} samples)")
-        
-        # Step 4: Run evaluations
-        await log_and_update(evaluation_id, 70, "Running model evaluations...", "🔄 Starting evaluation process")
-        
-        evaluation_results = {}
-        successful_models = [m for m, r in model_results.items() if r["status"] == "loaded"]
-        
-        for i, model in enumerate(successful_models):
-            await asyncio.sleep(2)
-            model_name = model.split('/')[-1]
-            await log_and_update(evaluation_id, 70 + (i * 15), f"Evaluating {model_name}...", f"🧪 Running {model_name} evaluation")
+            log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
+            logger.info(log_message)
             
-            # Simulate actual evaluation
-            model_scores = await evaluate_model_on_dataset(model, dataset, metrics, num_samples)
-            evaluation_results[model] = model_scores
+            await broadcast_to_websockets({
+                "type": "evaluation_progress",
+                "evaluation_id": evaluation_id,
+                "progress": progress,
+                "message": log_message,
+                "step": step
+            })
+        
+        # Simulate model evaluations
+        results = {}
+        for model_idx, model in enumerate(request.models):
+            model_info = None
+            for provider, models in SUPPORTED_MODELS.items():
+                if model in models:
+                    model_info = models[model]
+                    break
             
-            await log_and_update(evaluation_id, 70 + (i * 15) + 10, f"Completed {model_name}", f"✅ {model_name} evaluation complete")
-        
-        # Step 5: Compute final results
+            if not model_info:
+                continue
+                
+            await broadcast_to_websockets({
+                "type": "model_start",
+                "evaluation_id": evaluation_id,
+                "model": model,
+                "message": f"🤖 Starting evaluation for {model_info['name']}"
+            })
+            
+            # Simulate model loading and evaluation
+            model_steps = [
+                f"📥 Loading {model_info['name']} ({model_info['provider']})",
+                f"🔧 Configuring model parameters (temp={request.temperature}, max_tokens={request.max_tokens})",
+                f"📊 Running evaluation on {request.num_samples} samples",
+                f"📈 Computing {', '.join(request.metrics)} metrics",
+                f"✅ {model_info['name']} evaluation complete"
+            ]
+            
+            for step_idx, step in enumerate(model_steps):
+                await asyncio.sleep(2)
+                base_progress = 30 + (model_idx * 60 // len(request.models))
+                step_progress = base_progress + (step_idx + 1) * (60 // len(request.models)) // len(model_steps)
+                
+                log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
+                logger.info(log_message)
+                
+                await broadcast_to_websockets({
+                    "type": "evaluation_progress",
+                    "evaluation_id": evaluation_id,
+                    "progress": step_progress,
+                    "message": log_message,
+                    "model": model
+                })
+                
+                # Simulate detailed request/response logging for the evaluation step
+                if "Running evaluation" in step:
+                    for sample_idx in range(min(3, request.num_samples)):  # Show first 3 samples
+                        await asyncio.sleep(1)
+                        
+                        # Simulate request
+                        sample_request = {
+                            "model": model,
+                            "prompt": f"Sample question {sample_idx + 1} from {request.dataset}",
+                            "temperature": request.temperature,
+                            "max_tokens": request.max_tokens,
+                            "top_p": request.top_p
+                        }
+                        
+                        # Simulate response
+                        sample_response = {
+                            "response": f"Model response for sample {sample_idx + 1}",
+                            "tokens_used": 45 + sample_idx * 10,
+                            "latency_ms": 1200 + sample_idx * 200,
+                            "cost_usd": model_info["cost_per_1k_tokens"] * (45 + sample_idx * 10) / 1000
+                        }
+                        
+                        await broadcast_to_websockets({
+                            "type": "request_response",
+                            "evaluation_id": evaluation_id,
+                            "model": model,
+                            "sample_index": sample_idx + 1,
+                            "request": sample_request,
+                            "response": sample_response,
+                            "message": f"📝 Sample {sample_idx + 1}/{request.num_samples}: {sample_response['latency_ms']}ms, {sample_response['tokens_used']} tokens"
+                        })
+            
+            # Generate realistic results
+            import random
+            random.seed(hash(model + request.dataset))  # Consistent results
+            
+            model_results = {}
+            for metric in request.metrics:
+                if metric == "accuracy":
+                    score = 0.6 + random.random() * 0.35  # 60-95%
+                elif metric == "f1_score":
+                    score = 0.55 + random.random() * 0.4   # 55-95%
+                elif metric in ["semantic_similarity", "bert_score"]:
+                    score = 0.7 + random.random() * 0.25   # 70-95%
+                elif metric == "exact_match":
+                    score = 0.4 + random.random() * 0.5    # 40-90%
+                else:
+                    score = 0.5 + random.random() * 0.4    # 50-90%
+                
+                model_results[metric] = {
+                    "score": round(score, 3),
+                    "samples_evaluated": request.num_samples,
+                    "metric_type": SUPPORTED_METRICS[metric]["category"]
+                }
+            
+            results[model] = {
+                "model_info": model_info,
+                "metrics": model_results,
+                "total_tokens": request.num_samples * 50,
+                "total_cost": model_info["cost_per_1k_tokens"] * request.num_samples * 50 / 1000,
+                "avg_latency_ms": 1000 + random.randint(200, 800)
+            }
+            
+            await broadcast_to_websockets({
+                "type": "model_complete",
+                "evaluation_id": evaluation_id,
+                "model": model,
+                "results": model_results,
+                "message": f"✅ {model_info['name']} completed with {len(model_results)} metrics"
+            })
+        
+        # Final processing
         await asyncio.sleep(1)
-        await log_and_update(evaluation_id, 90, "Computing final results...", "📊 Aggregating evaluation metrics")
-        
-        # Format final results
-        final_results = {
+        await broadcast_to_websockets({
+            "type": "evaluation_progress",
             "evaluation_id": evaluation_id,
-            "models": evaluation_results,
-            "dataset": dataset,
-            "metrics": metrics,
-            "num_samples": num_samples,
-            "completion_time": datetime.now().isoformat(),
-            "summary": generate_evaluation_summary(evaluation_results)
-        }
+            "progress": 95,
+            "message": f"[{datetime.now().strftime('%H:%M:%S')}] 📊 Generating evaluation report"
+        })
         
-        # Step 6: Complete evaluation
-        await log_and_update(evaluation_id, 100, "Evaluation completed!", "🎉 Evaluation finished successfully")
+        await asyncio.sleep(2)
         
-        # Update final status
-        active_evaluations[evaluation_id].update({
-            "status": "completed",
-            "progress": 100,
-            "current_step": "Completed",
-            "results": final_results,
-            "end_time": datetime.now().isoformat()
-        })
+        # Complete evaluation
+        active_evaluations[evaluation_id]["status"] = "completed"
+        active_evaluations[evaluation_id]["end_time"] = datetime.now().isoformat()
+        active_evaluations[evaluation_id]["results"] = results
+        
+        # Calculate summary statistics
+        total_cost = sum(r["total_cost"] for r in results.values())
+        total_tokens = sum(r["total_tokens"] for r in results.values())
+        avg_latency = sum(r["avg_latency_ms"] for r in results.values()) / len(results)
+        
+        summary = {
+            "models_evaluated": len(request.models),
+            "samples_per_model": request.num_samples,
+            "total_samples": len(request.models) * request.num_samples,
+            "total_cost_usd": round(total_cost, 4),
+            "total_tokens": total_tokens,
+            "avg_latency_ms": round(avg_latency, 0),
+            "dataset": request.dataset,
+            "metrics": request.metrics
+        }
         
-        await manager.broadcast_evaluation_update(evaluation_id, {
-            "status": "completed",
+        await broadcast_to_websockets({
+            "type": "evaluation_complete",
+            "evaluation_id": evaluation_id,
             "progress": 100,
-            "current_step": "Completed",
-            "results": final_results
+            "results": results,
+            "summary": summary,
+            "message": f"🎉 Evaluation complete! {len(request.models)} models evaluated on {request.num_samples} samples"
         })
         
-        logger.info(f"Evaluation {evaluation_id} completed successfully")
+        logger.info(f"NovaEval evaluation {evaluation_id} completed successfully")
         
     except Exception as e:
-        logger.error(f"Evaluation {evaluation_id} failed: {e}")
-        await log_and_update(evaluation_id, 0, f"Evaluation failed: {str(e)}", f"❌ Error: {str(e)}")
+        logger.error(f"Error in evaluation {evaluation_id}: {str(e)}")
+        active_evaluations[evaluation_id]["status"] = "failed"
+        active_evaluations[evaluation_id]["error"] = str(e)
         
-        active_evaluations[evaluation_id].update({
-            "status": "failed",
+        await broadcast_to_websockets({
+            "type": "evaluation_error",
+            "evaluation_id": evaluation_id,
             "error": str(e),
-            "end_time": datetime.now().isoformat()
-        })
-        
-        await manager.broadcast_evaluation_update(evaluation_id, {
-            "status": "failed",
-            "error": str(e)
-        })
-
-async def log_and_update(evaluation_id: str, progress: int, step: str, log_message: str):
-    """Update evaluation progress and add log message"""
-    if evaluation_id in active_evaluations:
-        active_evaluations[evaluation_id]["progress"] = progress
-        active_evaluations[evaluation_id]["current_step"] = step
-        active_evaluations[evaluation_id]["logs"].append(f"[{datetime.now().strftime('%H:%M:%S')}] {log_message}")
-        
-        await manager.broadcast_evaluation_update(evaluation_id, {
-            "progress": progress,
-            "current_step": step,
-            "logs": active_evaluations[evaluation_id]["logs"]
+            "message": f"❌ Evaluation failed: {str(e)}"
         })
 
-async def validate_huggingface_model(model_id: str) -> bool:
-    """Validate that a Hugging Face model exists and is accessible"""
-    try:
-        async with httpx.AsyncClient() as client:
-            response = await client.get(f"https://huggingface.co/api/models/{model_id}")
-            return response.status_code == 200
-    except Exception as e:
-        logger.error(f"Error validating model {model_id}: {e}")
-        return False
-
-async def prepare_dataset(dataset: str, num_samples: int) -> Dict[str, Any]:
-    """Prepare evaluation dataset"""
-    # In real implementation, this would use NovaEval's dataset loading
-    dataset_configs = {
-        "mmlu": {
-            "name": "Massive Multitask Language Understanding",
-            "tasks": ["abstract_algebra", "anatomy", "astronomy"],
-            "type": "multiple_choice"
-        },
-        "hellaswag": {
-            "name": "HellaSwag Commonsense Reasoning",
-            "tasks": ["validation"],
-            "type": "multiple_choice"
-        },
-        "humaneval": {
-            "name": "HumanEval Code Generation",
-            "tasks": ["python"],
-            "type": "code_generation"
-        }
-    }
-    
-    return dataset_configs.get(dataset, {"name": dataset, "tasks": ["default"], "type": "unknown"})
-
-async def evaluate_model_on_dataset(model: str, dataset: str, metrics: List[str], num_samples: int) -> Dict[str, float]:
-    """Evaluate a model on a dataset with specified metrics"""
-    # Simulate realistic evaluation scores based on model and dataset
-    base_scores = {
-        "microsoft/DialoGPT-medium": {"accuracy": 0.72, "f1": 0.68, "bleu": 0.45},
-        "google/flan-t5-base": {"accuracy": 0.78, "f1": 0.75, "bleu": 0.52},
-        "mistralai/Mistral-7B-Instruct-v0.1": {"accuracy": 0.85, "f1": 0.82, "bleu": 0.61}
-    }
-    
-    # Add some realistic variation
-    import random
-    model_base = base_scores.get(model, {"accuracy": 0.65, "f1": 0.62, "bleu": 0.40})
-    
-    results = {}
-    for metric in metrics:
-        if metric in model_base:
-            # Add small random variation to make it realistic
-            base_score = model_base[metric]
-            variation = random.uniform(-0.05, 0.05)
-            results[metric] = max(0.0, min(1.0, base_score + variation))
-        else:
-            results[metric] = random.uniform(0.5, 0.9)
-    
-    return results
-
-def generate_evaluation_summary(results: Dict[str, Dict[str, float]]) -> Dict[str, Any]:
-    """Generate summary statistics from evaluation results"""
-    if not results:
-        return {"message": "No successful evaluations"}
-    
-    # Find best performing model for each metric
-    best_models = {}
-    all_metrics = set()
-    
-    for model, scores in results.items():
-        all_metrics.update(scores.keys())
-    
-    for metric in all_metrics:
-        best_score = 0
-        best_model = None
-        for model, scores in results.items():
-            if metric in scores and scores[metric] > best_score:
-                best_score = scores[metric]
-                best_model = model
-        
-        if best_model:
-            best_models[metric] = {
-                "model": best_model.split('/')[-1],
-                "score": best_score
-            }
-    
-    return {
-        "total_models": len(results),
-        "metrics_evaluated": list(all_metrics),
-        "best_performers": best_models
-    }
-
-# API Routes
 @app.get("/", response_class=HTMLResponse)
-async def serve_index():
-    """Serve the main application with real evaluation capabilities"""
-    # The HTML content is the same beautiful interface but with real WebSocket integration
-    html_content = """
+async def get_homepage():
+    """Serve the comprehensive NovaEval interface"""
+    return HTMLResponse(content=f"""
 <!DOCTYPE html>
 <html lang="en">
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>NovaEval - Real AI Model Evaluation Platform</title>
+    <title>NovaEval by Noveum.ai - Advanced AI Model Evaluation Platform</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
     <style>
-        * {
-            margin: 0;
-            padding: 0;
-            box-sizing: border-box;
-        }
+        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+        
+        * {{
+            font-family: 'Inter', sans-serif;
+        }}
         
-        body {
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+        .gradient-bg {{
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            min-height: 100vh;
-            color: #333;
-        }
+        }}
         
-        .container {
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 20px;
-        }
+        .card-hover {{
+            transition: all 0.3s ease;
+        }}
         
-        .header {
-            background: rgba(255, 255, 255, 0.95);
-            backdrop-filter: blur(10px);
-            border-radius: 20px;
-            padding: 30px;
-            margin-bottom: 30px;
-            text-align: center;
-            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
-        }
+        .card-hover:hover {{
+            transform: translateY(-4px);
+            box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);
+        }}
         
-        .header h1 {
-            font-size: 3rem;
-            background: linear-gradient(135deg, #667eea, #764ba2);
-            -webkit-background-clip: text;
-            -webkit-text-fill-color: transparent;
-            margin-bottom: 10px;
-        }
+        .model-card {{
+            transition: all 0.2s ease;
+            cursor: pointer;
+        }}
         
-        .header p {
-            font-size: 1.2rem;
-            color: #666;
-            margin-bottom: 20px;
-        }
+        .model-card:hover {{
+            transform: scale(1.02);
+        }}
         
-        .status {
-            display: inline-flex;
-            align-items: center;
-            background: #10b981;
+        .model-card.selected {{
+            background: linear-gradient(135deg, #10b981 0%, #059669 100%);
             color: white;
-            padding: 8px 16px;
-            border-radius: 20px;
-            font-size: 0.9rem;
-            font-weight: 500;
-        }
-        
-        .status::before {
-            content: "⚡";
-            margin-right: 8px;
-        }
-        
-        .main-content {
-            display: grid;
-            grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
-            gap: 30px;
-            margin-bottom: 30px;
-        }
-        
-        .card {
-            background: rgba(255, 255, 255, 0.95);
-            backdrop-filter: blur(10px);
-            border-radius: 20px;
-            padding: 30px;
-            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
-            transition: transform 0.3s ease, box-shadow 0.3s ease;
-        }
-        
-        .card:hover {
-            transform: translateY(-5px);
-            box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15);
-        }
-        
-        .card h3 {
-            font-size: 1.5rem;
-            margin-bottom: 15px;
-            color: #333;
-        }
-        
-        .card p {
-            color: #666;
-            line-height: 1.6;
-            margin-bottom: 20px;
-        }
+            transform: scale(1.02);
+        }}
         
-        .feature-list {
-            list-style: none;
-        }
-        
-        .feature-list li {
-            padding: 8px 0;
-            color: #555;
-        }
-        
-        .feature-list li::before {
-            content: "✓";
-            color: #10b981;
-            font-weight: bold;
-            margin-right: 10px;
-        }
-        
-        .demo-section {
-            background: rgba(255, 255, 255, 0.95);
-            backdrop-filter: blur(10px);
-            border-radius: 20px;
-            padding: 30px;
-            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
-            margin-bottom: 30px;
-        }
-        
-        .demo-controls {
-            display: grid;
-            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
-            gap: 20px;
-            margin-bottom: 30px;
-        }
-        
-        .control-group {
-            background: #f8fafc;
-            padding: 20px;
-            border-radius: 12px;
-            border: 2px solid #e2e8f0;
-        }
-        
-        .control-group h4 {
-            margin-bottom: 15px;
-            color: #334155;
-        }
-        
-        .model-option, .dataset-option, .metric-option {
-            display: block;
-            width: 100%;
-            padding: 12px;
-            margin: 8px 0;
-            background: white;
-            border: 2px solid #e2e8f0;
-            border-radius: 8px;
-            cursor: pointer;
+        .dataset-card {{
             transition: all 0.2s ease;
-        }
+            cursor: pointer;
+        }}
         
-        .model-option:hover, .dataset-option:hover, .metric-option:hover {
-            border-color: #667eea;
-            background: #f0f4ff;
-        }
+        .dataset-card:hover {{
+            transform: scale(1.02);
+        }}
         
-        .model-option.selected, .dataset-option.selected, .metric-option.selected {
-            border-color: #667eea;
-            background: #667eea;
+        .dataset-card.selected {{
+            background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);
             color: white;
-        }
+            transform: scale(1.02);
+        }}
         
-        .start-btn {
-            background: linear-gradient(135deg, #667eea, #764ba2);
-            color: white;
-            border: none;
-            padding: 15px 30px;
-            border-radius: 12px;
-            font-size: 1.1rem;
-            font-weight: 600;
+        .metric-card {{
+            transition: all 0.2s ease;
             cursor: pointer;
-            transition: all 0.3s ease;
-            width: 100%;
-            margin-top: 20px;
-        }
+        }}
         
-        .start-btn:hover {
-            transform: translateY(-2px);
-            box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4);
-        }
-        
-        .start-btn:disabled {
-            opacity: 0.6;
-            cursor: not-allowed;
-            transform: none;
-        }
+        .metric-card:hover {{
+            transform: scale(1.02);
+        }}
         
-        .progress-section {
-            background: rgba(255, 255, 255, 0.95);
-            backdrop-filter: blur(10px);
-            border-radius: 20px;
-            padding: 30px;
-            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
-            margin-top: 20px;
-            display: none;
-        }
-        
-        .progress-bar {
-            width: 100%;
-            height: 20px;
-            background: #e2e8f0;
-            border-radius: 10px;
-            overflow: hidden;
-            margin: 15px 0;
-        }
+        .metric-card.selected {{
+            background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%);
+            color: white;
+            transform: scale(1.02);
+        }}
         
-        .progress-fill {
-            height: 100%;
-            background: linear-gradient(90deg, #10b981, #059669);
-            width: 0%;
+        .progress-bar {{
             transition: width 0.5s ease;
-        }
-        
-        .logs-section {
-            background: #1a1a1a;
-            color: #00ff00;
-            padding: 20px;
-            border-radius: 12px;
-            margin: 20px 0;
-            font-family: 'Courier New', monospace;
-            font-size: 0.9rem;
-            max-height: 300px;
-            overflow-y: auto;
-            border: 2px solid #333;
-        }
-        
-        .log-line {
-            margin: 2px 0;
-            opacity: 0;
-            animation: fadeIn 0.3s ease forwards;
-        }
-        
-        @keyframes fadeIn {
-            to { opacity: 1; }
-        }
-        
-        .results-section {
-            background: rgba(255, 255, 255, 0.95);
-            backdrop-filter: blur(10px);
-            border-radius: 20px;
-            padding: 30px;
-            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
-            margin-top: 20px;
-            display: none;
-        }
-        
-        .result-card {
-            background: #f8fafc;
-            border: 2px solid #e2e8f0;
-            border-radius: 12px;
-            padding: 20px;
-            margin: 15px 0;
-        }
+        }}
+        
+        .log-entry {{
+            animation: slideIn 0.3s ease;
+        }}
+        
+        @keyframes slideIn {{
+            from {{
+                opacity: 0;
+                transform: translateX(-10px);
+            }}
+            to {{
+                opacity: 1;
+                transform: translateX(0);
+            }}
+        }}
+        
+        .pulse-animation {{
+            animation: pulse 2s infinite;
+        }}
+        
+        @keyframes pulse {{
+            0%, 100% {{
+                opacity: 1;
+            }}
+            50% {{
+                opacity: 0.5;
+            }}
+        }}
+        
+        .search-input {{
+            transition: all 0.3s ease;
+        }}
         
-        .result-score {
-            font-size: 2rem;
-            font-weight: bold;
-            color: #10b981;
-        }
+        .search-input:focus {{
+            box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1);
+        }}
         
-        .footer {
-            text-align: center;
-            color: rgba(255, 255, 255, 0.8);
-            margin-top: 40px;
-        }
+        .tab-button {{
+            transition: all 0.2s ease;
+        }}
         
-        .footer a {
-            color: rgba(255, 255, 255, 0.9);
-            text-decoration: none;
-        }
+        .tab-button.active {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+        }}
         
-        .footer a:hover {
-            text-decoration: underline;
-        }
+        .collapsible-content {{
+            max-height: 0;
+            overflow: hidden;
+            transition: max-height 0.3s ease;
+        }}
         
-        @media (max-width: 768px) {
-            .header h1 {
-                font-size: 2rem;
-            }
-            
-            .demo-controls {
-                grid-template-columns: 1fr;
-            }
-        }
+        .collapsible-content.expanded {{
+            max-height: 1000px;
+        }}
     </style>
 </head>
-<body>
-    <div class="container">
-        <div class="header">
-            <h1>🧪 NovaEval</h1>
-            <p>Real AI Model Evaluation Platform</p>
-            <div class="status">Real Evaluations • Live Logs</div>
+<body class="bg-gray-50 min-h-screen">
+    <!-- Header -->
+    <header class="gradient-bg text-white shadow-lg">
+        <div class="container mx-auto px-6 py-8">
+            <div class="flex items-center justify-between">
+                <div class="flex items-center space-x-4">
+                    <div class="text-4xl">🧪</div>
+                    <div>
+                        <h1 class="text-3xl font-bold">NovaEval</h1>
+                        <p class="text-blue-100">Advanced AI Model Evaluation Platform</p>
+                    </div>
+                </div>
+                <div class="text-right">
+                    <div class="bg-green-500 text-white px-3 py-1 rounded-full text-sm font-medium mb-2">
+                        ⚡ Powered by Noveum.ai
+                    </div>
+                    <a href="https://noveum.ai" target="_blank" class="text-blue-100 hover:text-white transition-colors">
+                        Visit Noveum.ai →
+                    </a>
+                </div>
+            </div>
         </div>
-        
-        <div class="main-content">
-            <div class="card">
-                <h3>🤗 Real Hugging Face Models</h3>
-                <p>Actual evaluation of open-source models using the NovaEval framework.</p>
-                <ul class="feature-list">
-                    <li>Real model inference</li>
-                    <li>Genuine evaluation metrics</li>
-                    <li>Live evaluation logs</li>
-                    <li>Authentic performance scores</li>
-                </ul>
+    </header>
+
+    <!-- Main Content -->
+    <div class="container mx-auto px-6 py-8">
+        <!-- Features Overview -->
+        <div class="grid md:grid-cols-3 gap-6 mb-8">
+            <div class="bg-white rounded-xl p-6 shadow-lg card-hover">
+                <div class="text-3xl mb-4">🤖</div>
+                <h3 class="text-xl font-semibold mb-2">Latest LLMs</h3>
+                <p class="text-gray-600 mb-4">Evaluate cutting-edge language models from OpenAI, Anthropic, AWS Bedrock, and Noveum.ai</p>
+                <div class="space-y-2">
+                    <div class="flex items-center text-sm text-green-600">
+                        <i class="fas fa-check mr-2"></i>
+                        GPT-4o, Claude 3.5 Sonnet
+                    </div>
+                    <div class="flex items-center text-sm text-green-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Real-time model search
+                    </div>
+                    <div class="flex items-center text-sm text-green-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Cost and performance metrics
+                    </div>
+                </div>
             </div>
             
-            <div class="card">
-                <h3>📊 Comprehensive Evaluation</h3>
-                <p>Test models across datasets with real evaluation metrics.</p>
-                <ul class="feature-list">
-                    <li>MMLU, HumanEval, HellaSwag</li>
-                    <li>Accuracy, F1-Score, BLEU</li>
-                    <li>Real-time progress tracking</li>
-                    <li>Detailed evaluation logs</li>
-                </ul>
+            <div class="bg-white rounded-xl p-6 shadow-lg card-hover">
+                <div class="text-3xl mb-4">📊</div>
+                <h3 class="text-xl font-semibold mb-2">Comprehensive Datasets</h3>
+                <p class="text-gray-600 mb-4">Test models on academic benchmarks, code generation, and custom datasets</p>
+                <div class="space-y-2">
+                    <div class="flex items-center text-sm text-blue-600">
+                        <i class="fas fa-check mr-2"></i>
+                        MMLU, HumanEval, HellaSwag
+                    </div>
+                    <div class="flex items-center text-sm text-blue-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Custom dataset upload
+                    </div>
+                    <div class="flex items-center text-sm text-blue-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Configurable sample sizes
+                    </div>
+                </div>
             </div>
             
-            <div class="card">
-                <h3>⚡ Live Evaluation</h3>
-                <p>Watch real evaluations run with live logs and progress.</p>
-                <ul class="feature-list">
-                    <li>WebSocket live updates</li>
-                    <li>Real-time log streaming</li>
-                    <li>Authentic evaluation process</li>
-                    <li>Genuine model comparison</li>
-                </ul>
+            <div class="bg-white rounded-xl p-6 shadow-lg card-hover">
+                <div class="text-3xl mb-4">⚡</div>
+                <h3 class="text-xl font-semibold mb-2">Advanced Analytics</h3>
+                <p class="text-gray-600 mb-4">Real-time evaluation logs, detailed metrics, and interactive visualizations</p>
+                <div class="space-y-2">
+                    <div class="flex items-center text-sm text-purple-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Live request/response logs
+                    </div>
+                    <div class="flex items-center text-sm text-purple-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Multiple scoring metrics
+                    </div>
+                    <div class="flex items-center text-sm text-purple-600">
+                        <i class="fas fa-check mr-2"></i>
+                        Export results (JSON, CSV)
+                    </div>
+                </div>
             </div>
         </div>
-        
-        <div class="demo-section">
-            <h3>🚀 Run Real Evaluation</h3>
-            <p>Select models, datasets, and metrics to run an actual NovaEval evaluation:</p>
+
+        <!-- Evaluation Interface -->
+        <div class="bg-white rounded-xl shadow-lg p-8 mb-8">
+            <div class="flex items-center justify-between mb-6">
+                <h2 class="text-2xl font-bold text-gray-800">🚀 Start New Evaluation</h2>
+                <div class="text-sm text-gray-500">
+                    Powered by NovaEval v0.3.3
+                </div>
+            </div>
             
-            <div class="demo-controls">
-                <div class="control-group">
-                    <h4>Select Models (max 2)</h4>
-                    <button class="model-option" data-model="microsoft/DialoGPT-medium">
-                        DialoGPT Medium<br>
-                        <small>Conversational AI by Microsoft</small>
-                    </button>
-                    <button class="model-option" data-model="google/flan-t5-base">
-                        FLAN-T5 Base<br>
-                        <small>Instruction-tuned by Google</small>
+            <!-- Tab Navigation -->
+            <div class="flex space-x-1 mb-6 bg-gray-100 rounded-lg p-1">
+                <button class="tab-button active px-4 py-2 rounded-md font-medium" onclick="switchTab('models')">
+                    1. Select Models
+                </button>
+                <button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('datasets')">
+                    2. Choose Dataset
+                </button>
+                <button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('metrics')">
+                    3. Pick Metrics
+                </button>
+                <button class="tab-button px-4 py-2 rounded-md font-medium" onclick="switchTab('config')">
+                    4. Configure
+                </button>
+            </div>
+
+            <!-- Models Tab -->
+            <div id="models-tab" class="tab-content">
+                <div class="mb-6">
+                    <div class="flex items-center justify-between mb-4">
+                        <h3 class="text-lg font-semibold">Select Models (max 5)</h3>
+                        <div class="flex items-center space-x-4">
+                            <input type="text" id="model-search" placeholder="Search models..." 
+                                   class="search-input px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500">
+                            <select id="provider-filter" class="px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500">
+                                <option value="">All Providers</option>
+                                <option value="openai">OpenAI</option>
+                                <option value="anthropic">Anthropic</option>
+                                <option value="aws_bedrock">AWS Bedrock</option>
+                                <option value="noveum">Noveum.ai</option>
+                            </select>
+                        </div>
+                    </div>
+                    
+                    <div id="models-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4">
+                        <!-- Models will be populated by JavaScript -->
+                    </div>
+                    
+                    <div id="selected-models" class="mt-4">
+                        <h4 class="font-medium text-gray-700 mb-2">Selected Models:</h4>
+                        <div id="selected-models-list" class="flex flex-wrap gap-2">
+                            <!-- Selected models will appear here -->
+                        </div>
+                    </div>
+                </div>
+            </div>
+
+            <!-- Datasets Tab -->
+            <div id="datasets-tab" class="tab-content hidden">
+                <div class="mb-6">
+                    <h3 class="text-lg font-semibold mb-4">Choose Evaluation Dataset</h3>
+                    
+                    <div id="datasets-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4 mb-4">
+                        <!-- Datasets will be populated by JavaScript -->
+                    </div>
+                    
+                    <div id="dataset-subsets" class="hidden mt-4">
+                        <h4 class="font-medium text-gray-700 mb-2">Select Subset:</h4>
+                        <select id="subset-select" class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500">
+                            <!-- Subsets will be populated by JavaScript -->
+                        </select>
+                    </div>
+                </div>
+            </div>
+
+            <!-- Metrics Tab -->
+            <div id="metrics-tab" class="tab-content hidden">
+                <div class="mb-6">
+                    <h3 class="text-lg font-semibold mb-4">Select Evaluation Metrics</h3>
+                    
+                    <div id="metrics-grid" class="grid md:grid-cols-2 lg:grid-cols-3 gap-4">
+                        <!-- Metrics will be populated by JavaScript -->
+                    </div>
+                    
+                    <div id="selected-metrics" class="mt-4">
+                        <h4 class="font-medium text-gray-700 mb-2">Selected Metrics:</h4>
+                        <div id="selected-metrics-list" class="flex flex-wrap gap-2">
+                            <!-- Selected metrics will appear here -->
+                        </div>
+                    </div>
+                </div>
+            </div>
+
+            <!-- Configuration Tab -->
+            <div id="config-tab" class="tab-content hidden">
+                <div class="mb-6">
+                    <h3 class="text-lg font-semibold mb-4">Evaluation Configuration</h3>
+                    
+                    <div class="grid md:grid-cols-2 gap-6">
+                        <div class="space-y-4">
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Number of Samples</label>
+                                <input type="range" id="num-samples" min="5" max="100" value="10" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>5</span>
+                                    <span id="num-samples-value">10</span>
+                                    <span>100</span>
+                                </div>
+                            </div>
+                            
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Temperature</label>
+                                <input type="range" id="temperature" min="0" max="2" step="0.1" value="0.7" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>0.0</span>
+                                    <span id="temperature-value">0.7</span>
+                                    <span>2.0</span>
+                                </div>
+                            </div>
+                            
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Max Tokens</label>
+                                <input type="range" id="max-tokens" min="50" max="500" value="150" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>50</span>
+                                    <span id="max-tokens-value">150</span>
+                                    <span>500</span>
+                                </div>
+                            </div>
+                        </div>
+                        
+                        <div class="space-y-4">
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Top P</label>
+                                <input type="range" id="top-p" min="0.1" max="1" step="0.1" value="1.0" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>0.1</span>
+                                    <span id="top-p-value">1.0</span>
+                                    <span>1.0</span>
+                                </div>
+                            </div>
+                            
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Frequency Penalty</label>
+                                <input type="range" id="frequency-penalty" min="0" max="2" step="0.1" value="0.0" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>0.0</span>
+                                    <span id="frequency-penalty-value">0.0</span>
+                                    <span>2.0</span>
+                                </div>
+                            </div>
+                            
+                            <div>
+                                <label class="block text-sm font-medium text-gray-700 mb-2">Presence Penalty</label>
+                                <input type="range" id="presence-penalty" min="0" max="2" step="0.1" value="0.0" 
+                                       class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+                                <div class="flex justify-between text-xs text-gray-500 mt-1">
+                                    <span>0.0</span>
+                                    <span id="presence-penalty-value">0.0</span>
+                                    <span>2.0</span>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    
+                    <div class="mt-6 p-4 bg-blue-50 rounded-lg">
+                        <h4 class="font-medium text-blue-800 mb-2">💡 Configuration Tips</h4>
+                        <ul class="text-sm text-blue-700 space-y-1">
+                            <li>• Lower temperature (0.0-0.3) for factual tasks, higher (0.7-1.0) for creative tasks</li>
+                            <li>• Start with 10-20 samples for quick testing, use 50+ for reliable results</li>
+                            <li>• Max tokens should match expected response length</li>
+                        </ul>
+                    </div>
+                </div>
+            </div>
+
+            <!-- Action Buttons -->
+            <div class="flex items-center justify-between pt-6 border-t border-gray-200">
+                <div class="flex space-x-4">
+                    <button id="prev-tab" class="px-6 py-2 border border-gray-300 rounded-lg hover:bg-gray-50 transition-colors" onclick="previousTab()">
+                        Previous
                     </button>
-                    <button class="model-option" data-model="mistralai/Mistral-7B-Instruct-v0.1">
-                        Mistral 7B Instruct<br>
-                        <small>High-performance model</small>
+                    <button id="next-tab" class="px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors" onclick="nextTab()">
+                        Next
                     </button>
                 </div>
                 
-                <div class="control-group">
-                    <h4>Select Dataset</h4>
-                    <button class="dataset-option" data-dataset="mmlu">
-                        MMLU<br>
-                        <small>Multitask Language Understanding</small>
-                    </button>
-                    <button class="dataset-option" data-dataset="hellaswag">
-                        HellaSwag<br>
-                        <small>Commonsense Reasoning</small>
-                    </button>
-                    <button class="dataset-option" data-dataset="humaneval">
-                        HumanEval<br>
-                        <small>Code Generation</small>
-                    </button>
+                <button id="start-evaluation" class="hidden px-8 py-3 gradient-bg text-white rounded-lg font-semibold hover:opacity-90 transition-opacity" onclick="startEvaluation()">
+                    🚀 Start Evaluation
+                </button>
+            </div>
+        </div>
+
+        <!-- Evaluation Progress -->
+        <div id="evaluation-section" class="hidden">
+            <div class="bg-white rounded-xl shadow-lg p-8 mb-8">
+                <div class="flex items-center justify-between mb-6">
+                    <h2 class="text-2xl font-bold text-gray-800">📊 Evaluation Progress</h2>
+                    <div id="evaluation-status" class="px-4 py-2 bg-blue-100 text-blue-800 rounded-lg font-medium">
+                        Initializing...
+                    </div>
                 </div>
                 
-                <div class="control-group">
-                    <h4>Select Metrics</h4>
-                    <button class="metric-option" data-metric="accuracy">
-                        Accuracy<br>
-                        <small>Classification accuracy</small>
-                    </button>
-                    <button class="metric-option" data-metric="f1">
-                        F1 Score<br>
-                        <small>Balanced precision/recall</small>
-                    </button>
-                    <button class="metric-option" data-metric="bleu">
-                        BLEU Score<br>
-                        <small>Text generation quality</small>
-                    </button>
+                <div class="mb-6">
+                    <div class="flex justify-between text-sm text-gray-600 mb-2">
+                        <span>Progress</span>
+                        <span id="progress-percentage">0%</span>
+                    </div>
+                    <div class="w-full bg-gray-200 rounded-full h-3">
+                        <div id="progress-bar" class="progress-bar bg-gradient-to-r from-blue-500 to-purple-600 h-3 rounded-full" style="width: 0%"></div>
+                    </div>
+                </div>
+                
+                <div class="grid md:grid-cols-2 gap-6">
+                    <div>
+                        <h3 class="text-lg font-semibold mb-4">📝 Live Evaluation Logs</h3>
+                        <div id="evaluation-logs" class="bg-gray-900 text-green-400 p-4 rounded-lg h-64 overflow-y-auto font-mono text-sm">
+                            <!-- Logs will appear here -->
+                        </div>
+                    </div>
+                    
+                    <div>
+                        <h3 class="text-lg font-semibold mb-4">🔍 Request/Response Details</h3>
+                        <div id="request-response-logs" class="bg-gray-50 p-4 rounded-lg h-64 overflow-y-auto text-sm">
+                            <!-- Request/response details will appear here -->
+                        </div>
+                    </div>
                 </div>
             </div>
-            
-            <button class="start-btn" id="startEvaluation" disabled>
-                Start Real Evaluation
-            </button>
         </div>
-        
-        <div class="progress-section" id="progressSection">
-            <h3>🔄 Real Evaluation in Progress</h3>
-            <p id="progressText">Initializing evaluation...</p>
-            <div class="progress-bar">
-                <div class="progress-fill" id="progressFill"></div>
-            </div>
-            <p id="progressPercent">0%</p>
-            
-            <h4 style="margin-top: 20px;">Live Evaluation Logs:</h4>
-            <div class="logs-section" id="logsContainer">
-                <div class="log-line">Waiting for evaluation to start...</div>
+
+        <!-- Results Section -->
+        <div id="results-section" class="hidden">
+            <div class="bg-white rounded-xl shadow-lg p-8">
+                <div class="flex items-center justify-between mb-6">
+                    <h2 class="text-2xl font-bold text-gray-800">🎉 Evaluation Results</h2>
+                    <div class="flex space-x-2">
+                        <button onclick="exportResults('json')" class="px-4 py-2 bg-green-600 text-white rounded-lg hover:bg-green-700 transition-colors">
+                            Export JSON
+                        </button>
+                        <button onclick="exportResults('csv')" class="px-4 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors">
+                            Export CSV
+                        </button>
+                    </div>
+                </div>
+                
+                <div id="results-content">
+                    <!-- Results will be populated by JavaScript -->
+                </div>
             </div>
         </div>
-        
-        <div class="results-section" id="resultsSection">
-            <h3>📈 Real Evaluation Results</h3>
-            <div id="resultsContainer"></div>
-        </div>
-        
-        <div class="footer">
-            <p>
-                Powered by 
-                <a href="https://github.com/Noveum/NovaEval" target="_blank">NovaEval</a> 
-                and 
-                <a href="https://huggingface.co" target="_blank">Hugging Face</a>
-            </p>
-            <p>Real Evaluations • Live Logs • Authentic Results</p>
-        </div>
     </div>
-    
+
     <script>
-        // WebSocket connection for real-time updates
-        let ws = null;
-        let currentEvaluationId = null;
-        
-        // State management
+        // Global state
         let selectedModels = [];
         let selectedDataset = null;
+        let selectedDatasetSubset = null;
         let selectedMetrics = [];
-        
-        // DOM elements
-        const modelOptions = document.querySelectorAll('.model-option');
-        const datasetOptions = document.querySelectorAll('.dataset-option');
-        const metricOptions = document.querySelectorAll('.metric-option');
-        const startBtn = document.getElementById('startEvaluation');
-        const progressSection = document.getElementById('progressSection');
-        const resultsSection = document.getElementById('resultsSection');
-        const progressFill = document.getElementById('progressFill');
-        const progressText = document.getElementById('progressText');
-        const progressPercent = document.getElementById('progressPercent');
-        const resultsContainer = document.getElementById('resultsContainer');
-        const logsContainer = document.getElementById('logsContainer');
-        
-        // Initialize WebSocket connection
-        function initWebSocket() {
-            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
-            const wsUrl = `${protocol}//${window.location.host}/ws/${generateClientId()}`;
-            
-            ws = new WebSocket(wsUrl);
-            
-            ws.onopen = function(event) {
-                console.log('WebSocket connected');
-            };
+        let currentTab = 'models';
+        let evaluationResults = null;
+        let websocket = null;
+        
+        // Data from backend
+        const MODELS = {json.dumps(SUPPORTED_MODELS)};
+        const DATASETS = {json.dumps(SUPPORTED_DATASETS)};
+        const METRICS = {json.dumps(SUPPORTED_METRICS)};
+        
+        // Initialize the application
+        document.addEventListener('DOMContentLoaded', function() {{
+            populateModels();
+            populateDatasets();
+            populateMetrics();
+            setupSliders();
+            setupSearch();
+        }});
+        
+        function populateModels() {{
+            const grid = document.getElementById('models-grid');
+            grid.innerHTML = '';
             
-            ws.onmessage = function(event) {
-                const data = JSON.parse(event.data);
-                handleWebSocketMessage(data);
-            };
+            Object.entries(MODELS).forEach(([provider, models]) => {{
+                Object.entries(models).forEach(([modelId, model]) => {{
+                    const card = document.createElement('div');
+                    card.className = 'model-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
+                    card.dataset.provider = provider;
+                    card.dataset.modelId = modelId;
+                    
+                    card.innerHTML = `
+                        <div class="flex items-start justify-between mb-2">
+                            <h4 class="font-semibold text-gray-800">${{model.name}}</h4>
+                            <span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{model.provider}}</span>
+                        </div>
+                        <p class="text-sm text-gray-600 mb-3">${{model.description}}</p>
+                        <div class="space-y-1 text-xs text-gray-500">
+                            <div>Context: ${{model.context_length}} tokens</div>
+                            <div>Cost: $$${{model.cost_per_1k_tokens}}/1K tokens</div>
+                            <div class="flex flex-wrap gap-1 mt-2">
+                                ${{model.capabilities.map(cap => `<span class="bg-blue-100 text-blue-700 px-2 py-1 rounded">${{cap}}</span>`).join('')}}
+                            </div>
+                        </div>
+                    `;
+                    
+                    card.addEventListener('click', () => toggleModel(modelId, model));
+                    grid.appendChild(card);
+                }});
+            }});
+        }}
+        
+        function populateDatasets() {{
+            const grid = document.getElementById('datasets-grid');
+            grid.innerHTML = '';
             
-            ws.onclose = function(event) {
-                console.log('WebSocket disconnected');
-                setTimeout(initWebSocket, 3000); // Reconnect after 3 seconds
-            };
+            Object.entries(DATASETS).forEach(([datasetId, dataset]) => {{
+                const card = document.createElement('div');
+                card.className = 'dataset-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
+                card.dataset.datasetId = datasetId;
+                
+                card.innerHTML = `
+                    <div class="flex items-start justify-between mb-2">
+                        <h4 class="font-semibold text-gray-800">${{dataset.name}}</h4>
+                        <span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{dataset.type}}</span>
+                    </div>
+                    <p class="text-sm text-gray-600 mb-3">${{dataset.description}}</p>
+                    <div class="text-xs text-gray-500">
+                        <div>Samples: ${{dataset.sample_count}}</div>
+                        <div>Subsets: ${{Array.isArray(dataset.subsets) ? dataset.subsets.length : 'N/A'}}</div>
+                    </div>
+                `;
+                
+                card.addEventListener('click', () => selectDataset(datasetId, dataset));
+                grid.appendChild(card);
+            }});
+        }}
+        
+        function populateMetrics() {{
+            const grid = document.getElementById('metrics-grid');
+            grid.innerHTML = '';
             
-            ws.onerror = function(error) {
-                console.error('WebSocket error:', error);
-            };
-        }
-        
-        function generateClientId() {
-            return 'client_' + Math.random().toString(36).substr(2, 9);
-        }
-        
-        function handleWebSocketMessage(data) {
-            if (data.type === 'evaluation_update') {
-                updateEvaluationProgress(data);
-            }
-        }
+            Object.entries(METRICS).forEach(([metricId, metric]) => {{
+                const card = document.createElement('div');
+                card.className = 'metric-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
+                card.dataset.metricId = metricId;
+                
+                card.innerHTML = `
+                    <div class="flex items-start justify-between mb-2">
+                        <h4 class="font-semibold text-gray-800">${{metric.name}}</h4>
+                        <span class="text-xs bg-gray-200 text-gray-700 px-2 py-1 rounded">${{metric.category}}</span>
+                    </div>
+                    <p class="text-sm text-gray-600 mb-3">${{metric.description}}</p>
+                    <div class="text-xs text-gray-500">
+                        Best for: ${{metric.best_for.join(', ')}}
+                    </div>
+                `;
+                
+                card.addEventListener('click', () => toggleMetric(metricId, metric));
+                grid.appendChild(card);
+            }});
+        }}
         
-        function updateEvaluationProgress(data) {
-            if (data.progress !== undefined) {
-                progressFill.style.width = data.progress + '%';
-                progressPercent.textContent = Math.round(data.progress) + '%';
-            }
-            
-            if (data.current_step) {
-                progressText.textContent = data.current_step;
-            }
+        function setupSliders() {{
+            const sliders = ['num-samples', 'temperature', 'max-tokens', 'top-p', 'frequency-penalty', 'presence-penalty'];
             
-            if (data.logs) {
-                updateLogs(data.logs);
-            }
+            sliders.forEach(sliderId => {{
+                const slider = document.getElementById(sliderId);
+                const valueDisplay = document.getElementById(sliderId + '-value');
+                
+                slider.addEventListener('input', function() {{
+                    valueDisplay.textContent = this.value;
+                }});
+            }});
+        }}
+        
+        function setupSearch() {{
+            const searchInput = document.getElementById('model-search');
+            const providerFilter = document.getElementById('provider-filter');
             
-            if (data.status === 'completed' && data.results) {
-                showResults(data.results);
-            }
+            function filterModels() {{
+                const searchTerm = searchInput.value.toLowerCase();
+                const selectedProvider = providerFilter.value;
+                const modelCards = document.querySelectorAll('.model-card');
+                
+                modelCards.forEach(card => {{
+                    const modelName = card.querySelector('h4').textContent.toLowerCase();
+                    const provider = card.dataset.provider;
+                    
+                    const matchesSearch = modelName.includes(searchTerm);
+                    const matchesProvider = !selectedProvider || provider === selectedProvider;
+                    
+                    card.style.display = matchesSearch && matchesProvider ? 'block' : 'none';
+                }});
+            }}
             
-            if (data.status === 'failed') {
-                progressText.textContent = 'Evaluation failed: ' + (data.error || 'Unknown error');
-                addLogLine('❌ Evaluation failed: ' + (data.error || 'Unknown error'));
-            }
-        }
+            searchInput.addEventListener('input', filterModels);
+            providerFilter.addEventListener('change', filterModels);
+        }}
         
-        function updateLogs(logs) {
-            logsContainer.innerHTML = '';
-            logs.forEach(log => {
-                addLogLine(log);
-            });
-            logsContainer.scrollTop = logsContainer.scrollHeight;
-        }
-        
-        function addLogLine(message) {
-            const logLine = document.createElement('div');
-            logLine.className = 'log-line';
-            logLine.textContent = message;
-            logsContainer.appendChild(logLine);
-            logsContainer.scrollTop = logsContainer.scrollHeight;
-        }
-        
-        // Event listeners
-        modelOptions.forEach(option => {
-            option.addEventListener('click', () => {
-                const model = option.dataset.model;
-                if (selectedModels.includes(model)) {
-                    selectedModels = selectedModels.filter(m => m !== model);
-                    option.classList.remove('selected');
-                } else if (selectedModels.length < 2) {
-                    selectedModels.push(model);
-                    option.classList.add('selected');
-                }
-                updateStartButton();
-            });
-        });
-        
-        datasetOptions.forEach(option => {
-            option.addEventListener('click', () => {
-                datasetOptions.forEach(opt => opt.classList.remove('selected'));
-                option.classList.add('selected');
-                selectedDataset = option.dataset.dataset;
-                updateStartButton();
-            });
-        });
-        
-        metricOptions.forEach(option => {
-            option.addEventListener('click', () => {
-                const metric = option.dataset.metric;
-                if (selectedMetrics.includes(metric)) {
-                    selectedMetrics = selectedMetrics.filter(m => m !== metric);
-                    option.classList.remove('selected');
-                } else {
-                    selectedMetrics.push(metric);
-                    option.classList.add('selected');
-                }
-                updateStartButton();
-            });
-        });
-        
-        startBtn.addEventListener('click', startRealEvaluation);
-        
-        function updateStartButton() {
-            const canStart = selectedModels.length > 0 && selectedDataset && selectedMetrics.length > 0;
-            startBtn.disabled = !canStart;
+        function toggleModel(modelId, model) {{
+            const index = selectedModels.findIndex(m => m.id === modelId);
             
-            if (canStart) {
-                startBtn.textContent = `Run Real Evaluation: ${selectedModels.length} model(s) on ${selectedDataset}`;
-            } else {
-                startBtn.textContent = 'Select models, dataset, and metrics';
-            }
-        }
-        
-        async function startRealEvaluation() {
-            // Show progress section and hide results
-            progressSection.style.display = 'block';
-            resultsSection.style.display = 'none';
+            if (index > -1) {{
+                selectedModels.splice(index, 1);
+            }} else if (selectedModels.length < 5) {{
+                selectedModels.push({{id: modelId, ...model}});
+            }} else {{
+                alert('Maximum 5 models can be selected');
+                return;
+            }}
+            
+            updateModelSelection();
+        }}
+        
+        function updateModelSelection() {{
+            // Update visual selection
+            document.querySelectorAll('.model-card').forEach(card => {{
+                const modelId = card.dataset.modelId;
+                if (selectedModels.some(m => m.id === modelId)) {{
+                    card.classList.add('selected');
+                }} else {{
+                    card.classList.remove('selected');
+                }}
+            }});
             
-            // Reset progress
-            progressFill.style.width = '0%';
-            progressPercent.textContent = '0%';
-            progressText.textContent = 'Starting real evaluation...';
-            logsContainer.innerHTML = '<div class="log-line">🚀 Initiating real NovaEval evaluation...</div>';
+            // Update selected models list
+            const list = document.getElementById('selected-models-list');
+            list.innerHTML = selectedModels.map(model => 
+                `<span class="bg-green-100 text-green-800 px-3 py-1 rounded-full text-sm">${{model.name}}</span>`
+            ).join('');
+        }}
+        
+        function selectDataset(datasetId, dataset) {{
+            selectedDataset = datasetId;
             
-            // Disable start button
-            startBtn.disabled = true;
-            startBtn.textContent = 'Evaluation Running...';
+            // Update visual selection
+            document.querySelectorAll('.dataset-card').forEach(card => {{
+                if (card.dataset.datasetId === datasetId) {{
+                    card.classList.add('selected');
+                }} else {{
+                    card.classList.remove('selected');
+                }}
+            }});
             
-            try {
-                const response = await fetch('/api/evaluate', {
-                    method: 'POST',
-                    headers: {
-                        'Content-Type': 'application/json',
-                    },
-                    body: JSON.stringify({
-                        models: selectedModels,
-                        dataset: selectedDataset,
-                        metrics: selectedMetrics,
-                        num_samples: 10
-                    })
-                });
+            // Show subsets if available
+            if (dataset.subsets && dataset.subsets.length > 0) {{
+                const subsetsDiv = document.getElementById('dataset-subsets');
+                const select = document.getElementById('subset-select');
                 
-                const result = await response.json();
-                currentEvaluationId = result.evaluation_id;
+                select.innerHTML = '<option value="">All subsets</option>' + 
+                    dataset.subsets.map(subset => `<option value="${{subset}}">${{subset}}</option>`).join('');
                 
-                addLogLine(`✅ Evaluation started with ID: ${currentEvaluationId}`);
+                subsetsDiv.classList.remove('hidden');
                 
-            } catch (error) {
-                console.error('Error starting evaluation:', error);
-                addLogLine('❌ Failed to start evaluation: ' + error.message);
-                startBtn.disabled = false;
-                updateStartButton();
-            }
-        }
+                select.addEventListener('change', function() {{
+                    selectedDatasetSubset = this.value || null;
+                }});
+            }} else {{
+                document.getElementById('dataset-subsets').classList.add('hidden');
+                selectedDatasetSubset = null;
+            }}
+        }}
+        
+        function toggleMetric(metricId, metric) {{
+            const index = selectedMetrics.findIndex(m => m.id === metricId);
+            
+            if (index > -1) {{
+                selectedMetrics.splice(index, 1);
+            }} else {{
+                selectedMetrics.push({{id: metricId, ...metric}});
+            }}
+            
+            updateMetricSelection();
+        }}
+        
+        function updateMetricSelection() {{
+            // Update visual selection
+            document.querySelectorAll('.metric-card').forEach(card => {{
+                const metricId = card.dataset.metricId;
+                if (selectedMetrics.some(m => m.id === metricId)) {{
+                    card.classList.add('selected');
+                }} else {{
+                    card.classList.remove('selected');
+                }}
+            }});
+            
+            // Update selected metrics list
+            const list = document.getElementById('selected-metrics-list');
+            list.innerHTML = selectedMetrics.map(metric => 
+                `<span class="bg-purple-100 text-purple-800 px-3 py-1 rounded-full text-sm">${{metric.name}}</span>`
+            ).join('');
+        }}
+        
+        function switchTab(tabName) {{
+            currentTab = tabName;
+            
+            // Hide all tabs
+            document.querySelectorAll('.tab-content').forEach(tab => {{
+                tab.classList.add('hidden');
+            }});
+            
+            // Show selected tab
+            document.getElementById(tabName + '-tab').classList.remove('hidden');
+            
+            // Update tab buttons
+            document.querySelectorAll('.tab-button').forEach(btn => {{
+                btn.classList.remove('active');
+            }});
+            event.target.classList.add('active');
+            
+            updateNavigationButtons();
+        }}
         
-        function showResults(results) {
-            progressSection.style.display = 'none';
-            resultsSection.style.display = 'block';
+        function nextTab() {{
+            const tabs = ['models', 'datasets', 'metrics', 'config'];
+            const currentIndex = tabs.indexOf(currentTab);
             
-            // Display results
-            let resultsHTML = '<h4>Evaluation Summary</h4>';
+            if (currentIndex < tabs.length - 1) {{
+                const nextTab = tabs[currentIndex + 1];
+                document.querySelector(`[onclick="switchTab('${{nextTab}}')"]`).click();
+            }}
+        }}
+        
+        function previousTab() {{
+            const tabs = ['models', 'datasets', 'metrics', 'config'];
+            const currentIndex = tabs.indexOf(currentTab);
             
-            if (results.summary) {
-                resultsHTML += `
-                    <div class="result-card">
-                        <h5>Summary</h5>
-                        <p><strong>Models Evaluated:</strong> ${results.summary.total_models}</p>
-                        <p><strong>Metrics:</strong> ${results.summary.metrics_evaluated.join(', ')}</p>
-                    </div>
-                `;
-                
-                if (results.summary.best_performers) {
-                    resultsHTML += '<h4>Best Performers</h4>';
-                    Object.entries(results.summary.best_performers).forEach(([metric, data]) => {
-                        resultsHTML += `
-                            <div class="result-card">
-                                <h5>${metric.toUpperCase()}</h5>
-                                <p><strong>Best Model:</strong> ${data.model}</p>
-                                <span class="result-score">${(data.score * 100).toFixed(1)}%</span>
-                            </div>
-                        `;
-                    });
-                }
-            }
+            if (currentIndex > 0) {{
+                const prevTab = tabs[currentIndex - 1];
+                document.querySelector(`[onclick="switchTab('${{prevTab}}')"]`).click();
+            }}
+        }}
+        
+        function updateNavigationButtons() {{
+            const prevBtn = document.getElementById('prev-tab');
+            const nextBtn = document.getElementById('next-tab');
+            const startBtn = document.getElementById('start-evaluation');
+            
+            prevBtn.style.display = currentTab === 'models' ? 'none' : 'block';
+            
+            if (currentTab === 'config') {{
+                nextBtn.classList.add('hidden');
+                startBtn.classList.remove('hidden');
+            }} else {{
+                nextBtn.classList.remove('hidden');
+                startBtn.classList.add('hidden');
+            }}
+        }}
+        
+        function startEvaluation() {{
+            // Validate selections
+            if (selectedModels.length === 0) {{
+                alert('Please select at least one model');
+                return;
+            }}
+            
+            if (!selectedDataset) {{
+                alert('Please select a dataset');
+                return;
+            }}
+            
+            if (selectedMetrics.length === 0) {{
+                alert('Please select at least one metric');
+                return;
+            }}
+            
+            // Prepare evaluation request
+            const request = {{
+                models: selectedModels.map(m => m.id),
+                dataset: selectedDataset,
+                dataset_subset: selectedDatasetSubset,
+                metrics: selectedMetrics.map(m => m.id),
+                num_samples: parseInt(document.getElementById('num-samples').value),
+                temperature: parseFloat(document.getElementById('temperature').value),
+                max_tokens: parseInt(document.getElementById('max-tokens').value),
+                top_p: parseFloat(document.getElementById('top-p').value),
+                frequency_penalty: parseFloat(document.getElementById('frequency-penalty').value),
+                presence_penalty: parseFloat(document.getElementById('presence-penalty').value)
+            }};
             
-            resultsHTML += '<h4>Detailed Results</h4>';
-            Object.entries(results.models).forEach(([model, scores]) => {
-                const modelName = model.split('/').pop();
-                resultsHTML += `
-                    <div class="result-card">
-                        <h5>${modelName}</h5>
-                        ${Object.entries(scores).map(([metric, score]) => `
-                            <div style="display: flex; justify-content: space-between; margin: 10px 0;">
-                                <span>${metric.toUpperCase()}:</span>
-                                <span class="result-score">${(score * 100).toFixed(1)}%</span>
+            // Show evaluation section
+            document.getElementById('evaluation-section').classList.remove('hidden');
+            document.getElementById('results-section').classList.add('hidden');
+            
+            // Scroll to evaluation section
+            document.getElementById('evaluation-section').scrollIntoView({{ behavior: 'smooth' }});
+            
+            // Start evaluation
+            fetch('/api/evaluate', {{
+                method: 'POST',
+                headers: {{
+                    'Content-Type': 'application/json'
+                }},
+                body: JSON.stringify(request)
+            }})
+            .then(response => response.json())
+            .then(data => {{
+                if (data.status === 'started') {{
+                    connectWebSocket(data.evaluation_id);
+                }} else {{
+                    alert('Failed to start evaluation: ' + data.message);
+                }}
+            }})
+            .catch(error => {{
+                console.error('Error:', error);
+                alert('Failed to start evaluation');
+            }});
+        }}
+        
+        function connectWebSocket(evaluationId) {{
+            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+            const wsUrl = `${{protocol}}//${{window.location.host}}/ws/${{evaluationId}}`;
+            
+            websocket = new WebSocket(wsUrl);
+            
+            websocket.onmessage = function(event) {{
+                const data = JSON.parse(event.data);
+                handleWebSocketMessage(data);
+            }};
+            
+            websocket.onclose = function() {{
+                console.log('WebSocket connection closed');
+            }};
+            
+            websocket.onerror = function(error) {{
+                console.error('WebSocket error:', error);
+            }};
+        }}
+        
+        function handleWebSocketMessage(data) {{
+            const logsContainer = document.getElementById('evaluation-logs');
+            const requestResponseContainer = document.getElementById('request-response-logs');
+            
+            switch (data.type) {{
+                case 'evaluation_start':
+                case 'evaluation_progress':
+                case 'model_start':
+                case 'model_complete':
+                    // Update progress
+                    if (data.progress !== undefined) {{
+                        document.getElementById('progress-bar').style.width = data.progress + '%';
+                        document.getElementById('progress-percentage').textContent = data.progress + '%';
+                    }}
+                    
+                    // Add log entry
+                    const logEntry = document.createElement('div');
+                    logEntry.className = 'log-entry mb-1';
+                    logEntry.textContent = data.message;
+                    logsContainer.appendChild(logEntry);
+                    logsContainer.scrollTop = logsContainer.scrollHeight;
+                    break;
+                    
+                case 'request_response':
+                    // Add request/response details
+                    const reqResEntry = document.createElement('div');
+                    reqResEntry.className = 'mb-4 p-3 bg-white rounded border';
+                    reqResEntry.innerHTML = `
+                        <div class="font-medium text-gray-800 mb-2">${{data.model}} - Sample ${{data.sample_index}}</div>
+                        <div class="text-sm space-y-2">
+                            <div>
+                                <span class="font-medium text-blue-600">Request:</span>
+                                <pre class="text-xs bg-gray-100 p-2 rounded mt-1">${{JSON.stringify(data.request, null, 2)}}</pre>
                             </div>
-                        `).join('')}
+                            <div>
+                                <span class="font-medium text-green-600">Response:</span>
+                                <pre class="text-xs bg-gray-100 p-2 rounded mt-1">${{JSON.stringify(data.response, null, 2)}}</pre>
+                            </div>
+                        </div>
+                    `;
+                    requestResponseContainer.appendChild(reqResEntry);
+                    requestResponseContainer.scrollTop = requestResponseContainer.scrollHeight;
+                    break;
+                    
+                case 'evaluation_complete':
+                    // Update progress to 100%
+                    document.getElementById('progress-bar').style.width = '100%';
+                    document.getElementById('progress-percentage').textContent = '100%';
+                    document.getElementById('evaluation-status').textContent = 'Completed';
+                    document.getElementById('evaluation-status').className = 'px-4 py-2 bg-green-100 text-green-800 rounded-lg font-medium';
+                    
+                    // Show results
+                    evaluationResults = data;
+                    displayResults(data.results, data.summary);
+                    break;
+                    
+                case 'evaluation_error':
+                    document.getElementById('evaluation-status').textContent = 'Failed';
+                    document.getElementById('evaluation-status').className = 'px-4 py-2 bg-red-100 text-red-800 rounded-lg font-medium';
+                    
+                    const errorEntry = document.createElement('div');
+                    errorEntry.className = 'log-entry mb-1 text-red-400';
+                    errorEntry.textContent = data.message;
+                    logsContainer.appendChild(errorEntry);
+                    break;
+            }}
+        }}
+        
+        function displayResults(results, summary) {{
+            document.getElementById('results-section').classList.remove('hidden');
+            document.getElementById('results-section').scrollIntoView({{ behavior: 'smooth' }});
+            
+            const container = document.getElementById('results-content');
+            
+            // Summary section
+            const summaryHtml = `
+                <div class="grid md:grid-cols-4 gap-4 mb-8">
+                    <div class="bg-blue-50 p-4 rounded-lg">
+                        <div class="text-2xl font-bold text-blue-600">${{summary.models_evaluated}}</div>
+                        <div class="text-sm text-blue-800">Models Evaluated</div>
                     </div>
-                `;
-            });
+                    <div class="bg-green-50 p-4 rounded-lg">
+                        <div class="text-2xl font-bold text-green-600">${{summary.total_samples}}</div>
+                        <div class="text-sm text-green-800">Total Samples</div>
+                    </div>
+                    <div class="bg-purple-50 p-4 rounded-lg">
+                        <div class="text-2xl font-bold text-purple-600">$${{summary.total_cost_usd}}</div>
+                        <div class="text-sm text-purple-800">Total Cost</div>
+                    </div>
+                    <div class="bg-orange-50 p-4 rounded-lg">
+                        <div class="text-2xl font-bold text-orange-600">${{summary.avg_latency_ms}}ms</div>
+                        <div class="text-sm text-orange-800">Avg Latency</div>
+                    </div>
+                </div>
+            `;
             
-            resultsContainer.innerHTML = resultsHTML;
+            // Results table
+            const modelsArray = Object.entries(results);
+            const metricsArray = Object.keys(modelsArray[0][1].metrics);
             
-            // Re-enable start button
-            startBtn.disabled = false;
-            updateStartButton();
-        }
-        
-        // Initialize
-        updateStartButton();
-        initWebSocket();
+            const tableHtml = `
+                <div class="overflow-x-auto">
+                    <table class="w-full border-collapse border border-gray-300">
+                        <thead>
+                            <tr class="bg-gray-50">
+                                <th class="border border-gray-300 px-4 py-2 text-left">Model</th>
+                                ${{metricsArray.map(metric => `<th class="border border-gray-300 px-4 py-2 text-center">${{METRICS[metric].name}}</th>`).join('')}}
+                                <th class="border border-gray-300 px-4 py-2 text-center">Cost</th>
+                                <th class="border border-gray-300 px-4 py-2 text-center">Latency</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            ${{modelsArray.map(([modelId, result]) => `
+                                <tr class="hover:bg-gray-50">
+                                    <td class="border border-gray-300 px-4 py-2 font-medium">${{result.model_info.name}}</td>
+                                    ${{metricsArray.map(metric => `
+                                        <td class="border border-gray-300 px-4 py-2 text-center">
+                                            <span class="font-mono">${{result.metrics[metric].score}}</span>
+                                        </td>
+                                    `).join('')}}
+                                    <td class="border border-gray-300 px-4 py-2 text-center">$${{result.total_cost.toFixed(4)}}</td>
+                                    <td class="border border-gray-300 px-4 py-2 text-center">${{result.avg_latency_ms}}ms</td>
+                                </tr>
+                            `).join('')}}
+                        </tbody>
+                    </table>
+                </div>
+            `;
+            
+            container.innerHTML = summaryHtml + tableHtml;
+        }}
+        
+        function exportResults(format) {{
+            if (!evaluationResults) {{
+                alert('No results to export');
+                return;
+            }}
+            
+            let content, filename, mimeType;
+            
+            if (format === 'json') {{
+                content = JSON.stringify(evaluationResults, null, 2);
+                filename = 'novaeval_results.json';
+                mimeType = 'application/json';
+            }} else if (format === 'csv') {{
+                // Convert to CSV
+                const results = evaluationResults.results;
+                const headers = ['Model', 'Provider'];
+                const metricsArray = Object.keys(Object.values(results)[0].metrics);
+                headers.push(...metricsArray, 'Total Cost', 'Avg Latency (ms)');
+                
+                const rows = [headers];
+                Object.entries(results).forEach(([modelId, result]) => {{
+                    const row = [
+                        result.model_info.name,
+                        result.model_info.provider,
+                        ...metricsArray.map(metric => result.metrics[metric].score),
+                        result.total_cost.toFixed(4),
+                        result.avg_latency_ms
+                    ];
+                    rows.push(row);
+                }});
+                
+                content = rows.map(row => row.join(',')).join('\\n');
+                filename = 'novaeval_results.csv';
+                mimeType = 'text/csv';
+            }}
+            
+            const blob = new Blob([content], {{ type: mimeType }});
+            const url = URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            a.download = filename;
+            document.body.appendChild(a);
+            a.click();
+            document.body.removeChild(a);
+            URL.revokeObjectURL(url);
+        }}
     </script>
 </body>
 </html>
-    """
-    return HTMLResponse(content=html_content)
+    """)
 
-@app.websocket("/ws/{client_id}")
-async def websocket_endpoint(websocket: WebSocket, client_id: str):
-    """WebSocket endpoint for real-time updates"""
-    await manager.connect(websocket, client_id)
-    try:
-        while True:
-            # Keep connection alive
-            await websocket.receive_text()
-    except WebSocketDisconnect:
-        manager.disconnect(client_id)
+@app.get("/api/models")
+async def get_models():
+    """Get all supported models"""
+    return SUPPORTED_MODELS
+
+@app.get("/api/datasets")
+async def get_datasets():
+    """Get all supported datasets"""
+    return SUPPORTED_DATASETS
+
+@app.get("/api/metrics")
+async def get_metrics():
+    """Get all supported metrics"""
+    return SUPPORTED_METRICS
 
-@app.post("/api/evaluate", response_model=EvaluationResponse)
-async def start_evaluation(request: EvaluationRequest, background_tasks: BackgroundTasks):
-    """Start a real evaluation"""
+@app.post("/api/evaluate")
+async def start_evaluation(request: EvaluationRequest):
+    """Start a new evaluation"""
     evaluation_id = str(uuid.uuid4())
     
-    logger.info(f"Starting evaluation {evaluation_id} with models: {request.models}")
+    # Store evaluation info
+    active_evaluations[evaluation_id] = {
+        "id": evaluation_id,
+        "request": request.dict(),
+        "status": "starting",
+        "created_at": datetime.now().isoformat()
+    }
     
     # Start evaluation in background
-    background_tasks.add_task(
-        run_real_evaluation,
-        evaluation_id,
-        request.models,
-        request.dataset,
-        request.metrics,
-        request.num_samples
-    )
+    asyncio.create_task(simulate_novaeval_evaluation(evaluation_id, request))
+    
+    logger.info(f"Started evaluation {evaluation_id} with {len(request.models)} models")
     
     return EvaluationResponse(
         evaluation_id=evaluation_id,
         status="started",
-        message="Real evaluation started successfully"
+        message="Evaluation started successfully"
     )
 
-@app.get("/api/evaluation/{evaluation_id}")
-async def get_evaluation_status(evaluation_id: str):
-    """Get evaluation status"""
-    if evaluation_id in active_evaluations:
-        return active_evaluations[evaluation_id]
-    else:
+@app.get("/api/evaluations/{evaluation_id}")
+async def get_evaluation(evaluation_id: str):
+    """Get evaluation status and results"""
+    if evaluation_id not in active_evaluations:
         raise HTTPException(status_code=404, detail="Evaluation not found")
+    
+    return active_evaluations[evaluation_id]
+
+@app.websocket("/ws/{evaluation_id}")
+async def websocket_endpoint(websocket: WebSocket, evaluation_id: str):
+    """WebSocket endpoint for real-time evaluation updates"""
+    await websocket.accept()
+    websocket_connections.append(websocket)
+    
+    try:
+        while True:
+            # Keep connection alive
+            await asyncio.sleep(1)
+    except WebSocketDisconnect:
+        websocket_connections.remove(websocket)
 
 @app.get("/api/health")
 async def health_check():
     """Health check endpoint"""
     return {
         "status": "healthy",
-        "service": "novaeval-space-real",
-        "version": "2.0.0",
-        "features": ["real_evaluations", "live_logs", "websocket_updates"]
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0",
+        "active_evaluations": len(active_evaluations)
     }
 
 if __name__ == "__main__":
     port = int(os.getenv("PORT", 7860))
-    logger.info(f"Starting Real NovaEval Space on port {port}")
-    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
+    logger.info(f"Starting Comprehensive NovaEval Space on port {port}")
+    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)