Spaces:
Sleeping
Sleeping
shashank_test
#1
by
shashankagar
- opened
- Dockerfile +0 -31
- README.md +433 -158
- app.py +0 -1447
- fixed-novaeval-space.zip +3 -0
- novaeval-space-deployment.zip +3 -0
- package.json +39 -0
- requirements.txt +0 -6
Dockerfile
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
FROM python:3.11-slim
|
2 |
-
|
3 |
-
# Set working directory
|
4 |
-
WORKDIR /app
|
5 |
-
|
6 |
-
# Install system dependencies
|
7 |
-
RUN apt-get update && apt-get install -y \
|
8 |
-
curl \
|
9 |
-
&& rm -rf /var/lib/apt/lists/*
|
10 |
-
|
11 |
-
# Copy requirements and install Python dependencies
|
12 |
-
COPY requirements.txt .
|
13 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
14 |
-
|
15 |
-
# Copy application code
|
16 |
-
COPY app.py app.py
|
17 |
-
|
18 |
-
# Create non-root user for security
|
19 |
-
RUN useradd -m -u 1000 user
|
20 |
-
USER user
|
21 |
-
|
22 |
-
# Expose port
|
23 |
-
EXPOSE 7860
|
24 |
-
|
25 |
-
# Health check
|
26 |
-
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
27 |
-
CMD curl -f http://localhost:7860/api/health || exit 1
|
28 |
-
|
29 |
-
# Run the application
|
30 |
-
CMD ["python", "app.py"]
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,191 +1,466 @@
|
|
1 |
---
|
2 |
-
title: NovaEval
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
pinned: false
|
|
|
|
|
|
|
|
|
8 |
---
|
9 |
-
|
10 |
# NovaEval by Noveum.ai
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
## 🚀 Features
|
15 |
|
16 |
-
|
17 |
-
- **
|
18 |
-
- **
|
19 |
-
- **
|
20 |
-
- **
|
21 |
-
|
22 |
-
|
23 |
-
- **
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
```bash
|
113 |
# Install dependencies
|
114 |
-
pip install -
|
115 |
|
116 |
-
# Run
|
117 |
-
|
118 |
|
119 |
-
#
|
|
|
120 |
```
|
121 |
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
-
###
|
125 |
-
- **Sample Size**: 10-1000 samples
|
126 |
-
- **Temperature**: 0.0-2.0 (creativity control)
|
127 |
-
- **Max Tokens**: 128-2048 (response length)
|
128 |
-
- **Top-p**: 0.9 (nucleus sampling)
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
- **Real-time Monitoring**: Watch evaluations progress live
|
134 |
-
- **Export Results**: Download results in JSON format
|
135 |
|
136 |
-
|
|
|
|
|
137 |
|
138 |
-
|
139 |
-
1. **Select Models** - Choose from 15+ Hugging Face models
|
140 |
-
2. **Pick Dataset** - Select from 11 evaluation datasets
|
141 |
-
3. **Configure Metrics** - Choose relevant evaluation metrics
|
142 |
-
4. **Set Parameters** - Adjust sample size, temperature, etc.
|
143 |
-
5. **Start Evaluation** - Watch real-time progress and logs
|
144 |
-
6. **View Results** - Analyze performance comparisons
|
145 |
|
146 |
-
|
147 |
-
- **Model Search** - Find models by name or provider
|
148 |
-
- **Category Filtering** - Filter by model size or dataset type
|
149 |
-
- **Real-time Logs** - See actual evaluation steps
|
150 |
-
- **Progress Tracking** - Visual progress bars and percentages
|
151 |
-
- **Interactive Results** - Compare models side-by-side
|
152 |
|
153 |
-
|
|
|
|
|
|
|
154 |
|
155 |
-
###
|
156 |
-
- **Comprehensive Benchmarking** across multiple models and datasets
|
157 |
-
- **Standardized Evaluation** with consistent metrics and procedures
|
158 |
-
- **Real-time Monitoring** to track evaluation progress
|
159 |
-
- **Export Capabilities** for further analysis
|
160 |
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
-
|
165 |
-
-
|
|
|
166 |
|
167 |
-
###
|
168 |
-
- **Collaborative Evaluation** with shareable results
|
169 |
-
- **Professional Interface** suitable for presentations
|
170 |
-
- **Comprehensive Documentation** for easy onboarding
|
171 |
-
- **Open Source** with full customization capabilities
|
172 |
|
173 |
-
|
|
|
|
|
|
|
174 |
|
175 |
-
|
176 |
-
- **NovaEval Framework**: [https://github.com/Noveum/NovaEval](https://github.com/Noveum/NovaEval)
|
177 |
-
- **Hugging Face Models**: [https://huggingface.co/models](https://huggingface.co/models)
|
178 |
-
- **Documentation**: Available in the application interface
|
179 |
|
180 |
-
|
181 |
|
182 |
-
|
|
|
|
|
|
|
|
|
183 |
|
184 |
-
|
185 |
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
|
|
|
1 |
---
|
2 |
+
title: NovaEval
|
3 |
+
emoji: 🐠
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: red
|
6 |
+
sdk: static
|
7 |
pinned: false
|
8 |
+
app_build_command: npm run build
|
9 |
+
app_file: build/index.html
|
10 |
+
license: apache-2.0
|
11 |
+
short_description: A comprehensive AI model evaluation framework.
|
12 |
---
|
|
|
13 |
# NovaEval by Noveum.ai
|
14 |
|
15 |
+
[![CI](https://github.com/Noveum/NovaEval/actions/workflows/ci.yml/badge.svg)](https://github.com/Noveum/NovaEval/actions/workflows/ci.yml)
|
16 |
+
[![Release](https://github.com/Noveum/NovaEval/actions/workflows/release.yml/badge.svg)](https://github.com/Noveum/NovaEval/actions/workflows/release.yml)
|
17 |
+
[![codecov](https://codecov.io/gh/Noveum/NovaEval/graph/badge.svg)](https://codecov.io/gh/Noveum/NovaEval)
|
18 |
+
[![PyPI version](https://badge.fury.io/py/novaeval.svg)](https://badge.fury.io/py/novaeval)
|
19 |
+
[![Python versions](https://img.shields.io/pypi/pyversions/novaeval.svg)](https://www.python.org/downloads/)
|
20 |
+
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
|
21 |
+
|
22 |
+
A comprehensive, extensible AI model evaluation framework designed for production use. NovaEval provides a unified interface for evaluating language models across various datasets, metrics, and deployment scenarios.
|
23 |
+
|
24 |
+
## 🚧 Development Status
|
25 |
+
|
26 |
+
> **⚠️ ACTIVE DEVELOPMENT - NOT PRODUCTION READY**
|
27 |
+
>
|
28 |
+
> NovaEval is currently in active development and **not recommended for production use**. We are actively working on improving stability, adding features, and expanding test coverage. APIs may change without notice.
|
29 |
+
>
|
30 |
+
> **We're looking for contributors!** See the [Contributing](#-contributing) section below for ways to help.
|
31 |
+
|
32 |
+
## 🤝 We Need Your Help!
|
33 |
+
|
34 |
+
NovaEval is an open-source project that thrives on community contributions. Whether you're a seasoned developer or just getting started, there are many ways to contribute:
|
35 |
+
|
36 |
+
### 🎯 High-Priority Contribution Areas
|
37 |
+
|
38 |
+
We're actively looking for contributors in these key areas:
|
39 |
+
|
40 |
+
- **🧪 Unit Tests**: Help us improve our test coverage (currently 23% overall, 90%+ for core modules)
|
41 |
+
- **📚 Examples**: Create real-world evaluation examples and use cases
|
42 |
+
- **📝 Guides & Notebooks**: Write evaluation guides and interactive Jupyter notebooks
|
43 |
+
- **📖 Documentation**: Improve API documentation and user guides
|
44 |
+
- **🔍 RAG Metrics**: Add more metrics specifically for Retrieval-Augmented Generation evaluation
|
45 |
+
- **🤖 Agent Evaluation**: Build frameworks for evaluating AI agents and multi-turn conversations
|
46 |
+
|
47 |
+
### 🚀 Getting Started as a Contributor
|
48 |
+
|
49 |
+
1. **Start Small**: Pick up issues labeled `good first issue` or `help wanted`
|
50 |
+
2. **Join Discussions**: Share your ideas in [GitHub Discussions](https://github.com/Noveum/NovaEval/discussions)
|
51 |
+
3. **Review Code**: Help review pull requests and provide feedback
|
52 |
+
4. **Report Issues**: Found a bug? Report it in [GitHub Issues](https://github.com/Noveum/NovaEval/issues)
|
53 |
+
5. **Spread the Word**: Star the repository and share with your network
|
54 |
|
55 |
## 🚀 Features
|
56 |
|
57 |
+
- **Multi-Model Support**: Evaluate models from OpenAI, Anthropic, AWS Bedrock, and custom providers
|
58 |
+
- **Extensible Scoring**: Built-in scorers for accuracy, semantic similarity, code evaluation, and custom metrics
|
59 |
+
- **Dataset Integration**: Support for MMLU, HuggingFace datasets, custom datasets, and more
|
60 |
+
- **Deployment Tooling**: Docker support, Kubernetes deployment, and cloud integrations
|
61 |
+
- **Comprehensive Reporting**: Detailed evaluation reports, artifacts, and visualizations
|
62 |
+
- **Secure**: Built-in credential management and secret store integration
|
63 |
+
- **Scalable**: Designed for both local testing and large-scale production evaluations
|
64 |
+
- **Cross-Platform**: Tested on macOS, Linux, and Windows with comprehensive CI/CD
|
65 |
+
|
66 |
+
## 📦 Installation
|
67 |
+
|
68 |
+
### From PyPI (Recommended)
|
69 |
+
|
70 |
+
```bash
|
71 |
+
pip install novaeval
|
72 |
+
```
|
73 |
+
|
74 |
+
### From Source
|
75 |
+
|
76 |
+
```bash
|
77 |
+
git clone https://github.com/Noveum/NovaEval.git
|
78 |
+
cd NovaEval
|
79 |
+
pip install -e .
|
80 |
+
```
|
81 |
+
|
82 |
+
### Docker
|
83 |
+
|
84 |
+
```bash
|
85 |
+
docker pull noveum/novaeval:latest
|
86 |
+
```
|
87 |
+
|
88 |
+
## 🏃‍♂️ Quick Start
|
89 |
+
|
90 |
+
### Basic Evaluation
|
91 |
+
|
92 |
+
```python
|
93 |
+
from novaeval import Evaluator
|
94 |
+
from novaeval.datasets import MMLUDataset
|
95 |
+
from novaeval.models import OpenAIModel
|
96 |
+
from novaeval.scorers import AccuracyScorer
|
97 |
+
|
98 |
+
# Configure for cost-conscious evaluation
|
99 |
+
MAX_TOKENS = 100 # Adjust based on budget: 5-10 for answers, 100+ for reasoning
|
100 |
+
|
101 |
+
# Initialize components
|
102 |
+
dataset = MMLUDataset(
|
103 |
+
subset="elementary_mathematics", # Easier subset for demo
|
104 |
+
num_samples=10,
|
105 |
+
split="test"
|
106 |
+
)
|
107 |
+
|
108 |
+
model = OpenAIModel(
|
109 |
+
model_name="gpt-4o-mini", # Cost-effective model
|
110 |
+
temperature=0.0,
|
111 |
+
max_tokens=MAX_TOKENS
|
112 |
+
)
|
113 |
+
|
114 |
+
scorer = AccuracyScorer(extract_answer=True)
|
115 |
+
|
116 |
+
# Create and run evaluation
|
117 |
+
evaluator = Evaluator(
|
118 |
+
dataset=dataset,
|
119 |
+
models=[model],
|
120 |
+
scorers=[scorer],
|
121 |
+
output_dir="./results"
|
122 |
+
)
|
123 |
+
|
124 |
+
results = evaluator.run()
|
125 |
+
|
126 |
+
# Display detailed results
|
127 |
+
for model_name, model_results in results["model_results"].items():
|
128 |
+
for scorer_name, score_info in model_results["scores"].items():
|
129 |
+
if isinstance(score_info, dict):
|
130 |
+
mean_score = score_info.get("mean", 0)
|
131 |
+
count = score_info.get("count", 0)
|
132 |
+
print(f"{scorer_name}: {mean_score:.4f} ({count} samples)")
|
133 |
+
```
|
134 |
+
|
135 |
+
### Configuration-Based Evaluation
|
136 |
+
|
137 |
+
```python
|
138 |
+
from novaeval import Evaluator
|
139 |
+
|
140 |
+
# Load configuration from YAML/JSON
|
141 |
+
evaluator = Evaluator.from_config("evaluation_config.yaml")
|
142 |
+
results = evaluator.run()
|
143 |
+
```
|
144 |
+
|
145 |
+
### Command Line Interface
|
146 |
+
|
147 |
+
NovaEval provides a comprehensive CLI for running evaluations:
|
148 |
+
|
149 |
+
```bash
|
150 |
+
# Run evaluation from configuration file
|
151 |
+
novaeval run config.yaml
|
152 |
+
|
153 |
+
# Quick evaluation with minimal setup
|
154 |
+
novaeval quick -d mmlu -m gpt-4 -s accuracy
|
155 |
+
|
156 |
+
# List available datasets, models, and scorers
|
157 |
+
novaeval list-datasets
|
158 |
+
novaeval list-models
|
159 |
+
novaeval list-scorers
|
160 |
+
|
161 |
+
# Generate sample configuration
|
162 |
+
novaeval generate-config sample-config.yaml
|
163 |
+
```
|
164 |
+
|
165 |
+
📖 **[Complete CLI Reference](docs/cli-reference.md)** - Detailed documentation for all CLI commands and options
|
166 |
+
|
167 |
+
### Example Configuration
|
168 |
+
|
169 |
+
```yaml
|
170 |
+
# evaluation_config.yaml
|
171 |
+
dataset:
|
172 |
+
type: "mmlu"
|
173 |
+
subset: "abstract_algebra"
|
174 |
+
num_samples: 500
|
175 |
+
|
176 |
+
models:
|
177 |
+
- type: "openai"
|
178 |
+
model_name: "gpt-4"
|
179 |
+
temperature: 0.0
|
180 |
+
- type: "anthropic"
|
181 |
+
model_name: "claude-3-opus"
|
182 |
+
temperature: 0.0
|
183 |
+
|
184 |
+
scorers:
|
185 |
+
- type: "accuracy"
|
186 |
+
- type: "semantic_similarity"
|
187 |
+
threshold: 0.8
|
188 |
+
|
189 |
+
output:
|
190 |
+
directory: "./results"
|
191 |
+
formats: ["json", "csv", "html"]
|
192 |
+
upload_to_s3: true
|
193 |
+
s3_bucket: "my-eval-results"
|
194 |
+
```
|
195 |
+
|
196 |
+
## 🏗️ Architecture
|
197 |
+
|
198 |
+
NovaEval is built with extensibility and modularity in mind:
|
199 |
+
|
200 |
+
```
|
201 |
+
src/novaeval/
|
202 |
+
├── datasets/ # Dataset loaders and processors
|
203 |
+
├── evaluators/ # Core evaluation logic
|
204 |
+
├── integrations/ # External service integrations
|
205 |
+
├── models/ # Model interfaces and adapters
|
206 |
+
├── reporting/ # Report generation and visualization
|
207 |
+
├── scorers/ # Scoring mechanisms and metrics
|
208 |
+
└── utils/ # Utility functions and helpers
|
209 |
+
```
|
210 |
+
|
211 |
+
### Core Components
|
212 |
+
|
213 |
+
- **Datasets**: Standardized interface for loading evaluation datasets
|
214 |
+
- **Models**: Unified API for different AI model providers
|
215 |
+
- **Scorers**: Pluggable scoring mechanisms for various evaluation metrics
|
216 |
+
- **Evaluators**: Orchestrates the evaluation process
|
217 |
+
- **Reporting**: Generates comprehensive reports and artifacts
|
218 |
+
- **Integrations**: Handles external services (S3, credential stores, etc.)
|
219 |
+
|
220 |
+
## 📊 Supported Datasets
|
221 |
+
|
222 |
+
- **MMLU**: Massive Multitask Language Understanding
|
223 |
+
- **HuggingFace**: Any dataset from the HuggingFace Hub
|
224 |
+
- **Custom**: JSON, CSV, or programmatic dataset definitions
|
225 |
+
- **Code Evaluation**: Programming benchmarks and code generation tasks
|
226 |
+
- **Agent Traces**: Multi-turn conversation and agent evaluation
|
227 |
+
|
228 |
+
## 🤖 Supported Models
|
229 |
+
|
230 |
+
- **OpenAI**: GPT-3.5, GPT-4, and newer models
|
231 |
+
- **Anthropic**: Claude family models
|
232 |
+
- **AWS Bedrock**: Amazon's managed AI services
|
233 |
+
- **Noveum AI Gateway**: Integration with Noveum's model gateway
|
234 |
+
- **Custom**: Extensible interface for any API-based model
|
235 |
+
|
236 |
+
## 📏 Built-in Scorers
|
237 |
+
|
238 |
+
### Accuracy-Based
|
239 |
+
- **ExactMatch**: Exact string matching
|
240 |
+
- **Accuracy**: Classification accuracy
|
241 |
+
- **F1Score**: F1 score for classification tasks
|
242 |
+
|
243 |
+
### Semantic-Based
|
244 |
+
- **SemanticSimilarity**: Embedding-based similarity scoring
|
245 |
+
- **BERTScore**: BERT-based semantic evaluation
|
246 |
+
- **RougeScore**: ROUGE metrics for text generation
|
247 |
+
|
248 |
+
### Code-Specific
|
249 |
+
- **CodeExecution**: Execute and validate code outputs
|
250 |
+
- **SyntaxChecker**: Validate code syntax
|
251 |
+
- **TestCoverage**: Code coverage analysis
|
252 |
+
|
253 |
+
### Custom
|
254 |
+
- **LLMJudge**: Use another LLM as a judge
|
255 |
+
- **HumanEval**: Integration with human evaluation workflows
|
256 |
+
|
257 |
+
## 🚀 Deployment
|
258 |
+
|
259 |
+
### Local Development
|
260 |
|
261 |
```bash
|
262 |
# Install dependencies
|
263 |
+
pip install -e ".[dev]"
|
264 |
|
265 |
+
# Run tests
|
266 |
+
pytest
|
267 |
|
268 |
+
# Run example evaluation
|
269 |
+
python examples/basic_evaluation.py
|
270 |
```
|
271 |
|
272 |
+
### Docker
|
273 |
+
|
274 |
+
```bash
|
275 |
+
# Build image
|
276 |
+
docker build -t nova-eval .
|
277 |
+
|
278 |
+
# Run evaluation
|
279 |
+
docker run -v "$(pwd)/config:/config" -v "$(pwd)/results:/results" nova-eval --config /config/eval.yaml
|
280 |
+
```
|
281 |
|
282 |
+
### Kubernetes
|
|
|
|
|
|
|
|
|
283 |
|
284 |
+
```bash
|
285 |
+
# Deploy to Kubernetes
|
286 |
+
kubectl apply -f kubernetes/
|
|
|
|
|
287 |
|
288 |
+
# Check status
|
289 |
+
kubectl get pods -l app=nova-eval
|
290 |
+
```
|
291 |
|
292 |
+
## 🔧 Configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
|
294 |
+
NovaEval supports configuration through:
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
+
- **YAML/JSON files**: Declarative configuration
|
297 |
+
- **Environment variables**: Runtime configuration
|
298 |
+
- **Python code**: Programmatic configuration
|
299 |
+
- **CLI arguments**: Command-line overrides
|
300 |
|
301 |
+
### Environment Variables
|
|
|
|
|
|
|
|
|
302 |
|
303 |
+
```bash
|
304 |
+
export NOVA_EVAL_OUTPUT_DIR="./results"
|
305 |
+
export NOVA_EVAL_LOG_LEVEL="INFO"
|
306 |
+
export OPENAI_API_KEY="your-api-key"
|
307 |
+
export AWS_ACCESS_KEY_ID="your-aws-key"
|
308 |
+
```
|
309 |
|
310 |
+
### CI/CD Integration
|
|
|
|
|
|
|
|
|
311 |
|
312 |
+
NovaEval includes optimized GitHub Actions workflows:
|
313 |
+
- **Unit tests** run on all PRs and pushes for quick feedback
|
314 |
+
- **Integration tests** run on main branch only to minimize API costs
|
315 |
+
- **Cross-platform testing** on macOS, Linux, and Windows
|
316 |
|
317 |
+
## 📈 Reporting and Artifacts
|
|
|
|
|
|
|
318 |
|
319 |
+
NovaEval generates comprehensive evaluation reports:
|
320 |
|
321 |
+
- **Summary Reports**: High-level metrics and insights
|
322 |
+
- **Detailed Results**: Per-sample predictions and scores
|
323 |
+
- **Visualizations**: Charts and graphs for result analysis
|
324 |
+
- **Artifacts**: Model outputs, intermediate results, and debug information
|
325 |
+
- **Export Formats**: JSON, CSV, HTML, PDF
|
326 |
|
327 |
+
### Example Report Structure
|
328 |
|
329 |
+
```
|
330 |
+
results/
|
331 |
+
├── summary.json # High-level metrics
|
332 |
+
├── detailed_results.csv # Per-sample results
|
333 |
+
├── artifacts/
|
334 |
+
│ ├── model_outputs/ # Raw model responses
|
335 |
+
│ ├── intermediate/ # Processing artifacts
|
336 |
+
│ └── debug/ # Debug information
|
337 |
+
├── visualizations/
|
338 |
+
│ ├── accuracy_by_category.png
|
339 |
+
│ ├── score_distribution.png
|
340 |
+
│ └── confusion_matrix.png
|
341 |
+
└── report.html # Interactive HTML report
|
342 |
+
```
|
343 |
|
344 |
+
## 🔌 Extending NovaEval
|
345 |
+
|
346 |
+
### Custom Datasets
|
347 |
+
|
348 |
+
```python
|
349 |
+
from novaeval.datasets import BaseDataset
|
350 |
+
|
351 |
+
class MyCustomDataset(BaseDataset):
|
352 |
+
def load_data(self):
|
353 |
+
# Implement data loading logic
|
354 |
+
return samples
|
355 |
+
|
356 |
+
def get_sample(self, index):
|
357 |
+
# Return individual sample
|
358 |
+
return sample
|
359 |
+
```
|
360 |
+
|
361 |
+
### Custom Scorers
|
362 |
+
|
363 |
+
```python
|
364 |
+
from novaeval.scorers import BaseScorer
|
365 |
+
|
366 |
+
class MyCustomScorer(BaseScorer):
|
367 |
+
def score(self, prediction, ground_truth, context=None):
|
368 |
+
# Implement scoring logic
|
369 |
+
return score
|
370 |
+
```
|
371 |
+
|
372 |
+
### Custom Models
|
373 |
+
|
374 |
+
```python
|
375 |
+
from novaeval.models import BaseModel
|
376 |
+
|
377 |
+
class MyCustomModel(BaseModel):
|
378 |
+
def generate(self, prompt, **kwargs):
|
379 |
+
# Implement model inference
|
380 |
+
return response
|
381 |
+
```
|
382 |
|
383 |
+
## 🤝 Contributing
|
384 |
+
|
385 |
+
We welcome contributions! NovaEval is actively seeking contributors to help build a robust AI evaluation framework. Please see our [Contributing Guide](CONTRIBUTING.md) for detailed guidelines.
|
386 |
+
|
387 |
+
### 🎯 Priority Contribution Areas
|
388 |
+
|
389 |
+
As mentioned in the [We Need Your Help](#-we-need-your-help) section, we're particularly looking for help with:
|
390 |
+
|
391 |
+
1. **Unit Tests** - Expand test coverage beyond the current 23%
|
392 |
+
2. **Examples** - Real-world evaluation scenarios and use cases
|
393 |
+
3. **Guides & Notebooks** - Interactive evaluation tutorials
|
394 |
+
4. **Documentation** - API docs, user guides, and tutorials
|
395 |
+
5. **RAG Metrics** - Specialized metrics for retrieval-augmented generation
|
396 |
+
6. **Agent Evaluation** - Frameworks for multi-turn and agent-based evaluations
|
397 |
+
|
398 |
+
### Development Setup
|
399 |
+
|
400 |
+
```bash
|
401 |
+
# Clone repository
|
402 |
+
git clone https://github.com/Noveum/NovaEval.git
|
403 |
+
cd NovaEval
|
404 |
+
|
405 |
+
# Create virtual environment
|
406 |
+
python -m venv venv
|
407 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
408 |
+
|
409 |
+
# Install development dependencies
|
410 |
+
pip install -e ".[dev]"
|
411 |
+
|
412 |
+
# Install pre-commit hooks
|
413 |
+
pre-commit install
|
414 |
+
|
415 |
+
# Run tests
|
416 |
+
pytest
|
417 |
+
|
418 |
+
# Run with coverage
|
419 |
+
pytest --cov=src/novaeval --cov-report=html
|
420 |
+
```
|
421 |
+
|
422 |
+
### 🏗️ Contribution Workflow
|
423 |
+
|
424 |
+
1. **Fork** the repository
|
425 |
+
2. **Create** a feature branch (`git checkout -b feature/amazing-feature`)
|
426 |
+
3. **Make** your changes following our coding standards
|
427 |
+
4. **Add** tests for your changes
|
428 |
+
5. **Commit** your changes (`git commit -m 'Add amazing feature'`)
|
429 |
+
6. **Push** to the branch (`git push origin feature/amazing-feature`)
|
430 |
+
7. **Open** a Pull Request
|
431 |
+
|
432 |
+
### 📋 Contribution Guidelines
|
433 |
+
|
434 |
+
- **Code Quality**: Follow PEP 8 and use the provided pre-commit hooks
|
435 |
+
- **Testing**: Add unit tests for new features and bug fixes
|
436 |
+
- **Documentation**: Update documentation for API changes
|
437 |
+
- **Commit Messages**: Use conventional commit format
|
438 |
+
- **Issues**: Reference relevant issues in your PR description
|
439 |
+
|
440 |
+
### 🎉 Recognition
|
441 |
+
|
442 |
+
Contributors will be:
|
443 |
+
- Listed in our contributors page
|
444 |
+
- Mentioned in release notes for significant contributions
|
445 |
+
- Invited to join our contributor Discord community
|
446 |
+
|
447 |
+
## 📄 License
|
448 |
+
|
449 |
+
This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
|
450 |
+
|
451 |
+
## 🙏 Acknowledgments
|
452 |
+
|
453 |
+
- Inspired by evaluation frameworks like DeepEval, Confident AI, and Braintrust
|
454 |
+
- Built with modern Python best practices and industry standards
|
455 |
+
- Designed for the AI evaluation community
|
456 |
+
|
457 |
+
## 📞 Support
|
458 |
+
|
459 |
+
- **Documentation**: [https://noveum.github.io/NovaEval](https://noveum.github.io/NovaEval)
|
460 |
+
- **Issues**: [GitHub Issues](https://github.com/Noveum/NovaEval/issues)
|
461 |
+
- **Discussions**: [GitHub Discussions](https://github.com/Noveum/NovaEval/discussions)
|
462 |
+
- **Email**: [email protected]
|
463 |
+
|
464 |
+
---
|
465 |
|
466 |
+
Made with ❤️ by the Noveum.ai team
|
app.py
DELETED
@@ -1,1447 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
NovaEval Space by Noveum.ai
|
3 |
-
Advanced AI Model Evaluation Platform using NovaEval Framework
|
4 |
-
"""
|
5 |
-
|
6 |
-
import asyncio
|
7 |
-
import json
|
8 |
-
import logging
|
9 |
-
import os
|
10 |
-
import sys
|
11 |
-
import time
|
12 |
-
import uuid
|
13 |
-
from datetime import datetime
|
14 |
-
from typing import Dict, List, Optional, Any
|
15 |
-
import uvicorn
|
16 |
-
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
|
17 |
-
from fastapi.responses import HTMLResponse
|
18 |
-
from fastapi.middleware.cors import CORSMiddleware
|
19 |
-
from pydantic import BaseModel
|
20 |
-
import httpx
|
21 |
-
import traceback
|
22 |
-
|
23 |
-
# Configure comprehensive logging
|
24 |
-
logging.basicConfig(
|
25 |
-
level=logging.INFO,
|
26 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
27 |
-
handlers=[logging.StreamHandler(sys.stdout)]
|
28 |
-
)
|
29 |
-
logger = logging.getLogger(__name__)
|
30 |
-
|
31 |
-
app = FastAPI(
|
32 |
-
title="NovaEval by Noveum.ai",
|
33 |
-
description="Advanced AI Model Evaluation Platform using NovaEval Framework",
|
34 |
-
version="4.0.0"
|
35 |
-
)
|
36 |
-
|
37 |
-
app.add_middleware(
|
38 |
-
CORSMiddleware,
|
39 |
-
allow_origins=["*"],
|
40 |
-
allow_credentials=True,
|
41 |
-
allow_methods=["*"],
|
42 |
-
allow_headers=["*"],
|
43 |
-
)
|
44 |
-
|
45 |
-
# Pydantic Models
|
46 |
-
class EvaluationRequest(BaseModel):
|
47 |
-
models: List[str]
|
48 |
-
dataset: str
|
49 |
-
metrics: List[str]
|
50 |
-
sample_size: int = 50
|
51 |
-
temperature: float = 0.7
|
52 |
-
max_tokens: int = 512
|
53 |
-
top_p: float = 0.9
|
54 |
-
|
55 |
-
class EvaluationResponse(BaseModel):
|
56 |
-
evaluation_id: str
|
57 |
-
status: str
|
58 |
-
message: str
|
59 |
-
|
60 |
-
# Global state
|
61 |
-
active_evaluations = {}
|
62 |
-
websocket_connections = {}
|
63 |
-
request_logs = []
|
64 |
-
|
65 |
-
# Hugging Face Models Configuration
|
66 |
-
HF_MODELS = {
|
67 |
-
"small": [
|
68 |
-
{
|
69 |
-
"id": "google/flan-t5-large",
|
70 |
-
"name": "FLAN-T5 Large",
|
71 |
-
"size": "0.8B",
|
72 |
-
"description": "Instruction-tuned T5 model for various NLP tasks",
|
73 |
-
"capabilities": ["text-generation", "reasoning", "qa"],
|
74 |
-
"provider": "Google"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"id": "Qwen/Qwen2.5-3B",
|
78 |
-
"name": "Qwen 2.5 3B",
|
79 |
-
"size": "3B",
|
80 |
-
"description": "Latest Qwen model with strong reasoning capabilities",
|
81 |
-
"capabilities": ["text-generation", "reasoning", "multilingual"],
|
82 |
-
"provider": "Alibaba"
|
83 |
-
},
|
84 |
-
{
|
85 |
-
"id": "google/gemma-2b",
|
86 |
-
"name": "Gemma 2B",
|
87 |
-
"size": "2B",
|
88 |
-
"description": "Efficient small model based on Gemini research",
|
89 |
-
"capabilities": ["text-generation", "reasoning"],
|
90 |
-
"provider": "Google"
|
91 |
-
}
|
92 |
-
],
|
93 |
-
"medium": [
|
94 |
-
{
|
95 |
-
"id": "Qwen/Qwen2.5-7B",
|
96 |
-
"name": "Qwen 2.5 7B",
|
97 |
-
"size": "7B",
|
98 |
-
"description": "Balanced performance and efficiency for most tasks",
|
99 |
-
"capabilities": ["text-generation", "reasoning", "analysis"],
|
100 |
-
"provider": "Alibaba"
|
101 |
-
},
|
102 |
-
{
|
103 |
-
"id": "mistralai/Mistral-7B-v0.1",
|
104 |
-
"name": "Mistral 7B",
|
105 |
-
"size": "7B",
|
106 |
-
"description": "High-performance open model with Apache 2.0 license",
|
107 |
-
"capabilities": ["text-generation", "reasoning", "analysis"],
|
108 |
-
"provider": "Mistral AI"
|
109 |
-
},
|
110 |
-
{
|
111 |
-
"id": "microsoft/DialoGPT-medium",
|
112 |
-
"name": "DialoGPT Medium",
|
113 |
-
"size": "345M",
|
114 |
-
"description": "Specialized for conversational AI applications",
|
115 |
-
"capabilities": ["conversation", "dialogue"],
|
116 |
-
"provider": "Microsoft"
|
117 |
-
},
|
118 |
-
{
|
119 |
-
"id": "codellama/CodeLlama-7b-Python-hf",
|
120 |
-
"name": "CodeLlama 7B Python",
|
121 |
-
"size": "7B",
|
122 |
-
"description": "Specialized for Python code generation and understanding",
|
123 |
-
"capabilities": ["code-generation", "python"],
|
124 |
-
"provider": "Meta"
|
125 |
-
}
|
126 |
-
],
|
127 |
-
"large": [
|
128 |
-
{
|
129 |
-
"id": "Qwen/Qwen2.5-14B",
|
130 |
-
"name": "Qwen 2.5 14B",
|
131 |
-
"size": "14B",
|
132 |
-
"description": "High-performance model for complex reasoning tasks",
|
133 |
-
"capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
|
134 |
-
"provider": "Alibaba"
|
135 |
-
},
|
136 |
-
{
|
137 |
-
"id": "Qwen/Qwen2.5-32B",
|
138 |
-
"name": "Qwen 2.5 32B",
|
139 |
-
"size": "32B",
|
140 |
-
"description": "Large-scale model for advanced AI applications",
|
141 |
-
"capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
|
142 |
-
"provider": "Alibaba"
|
143 |
-
},
|
144 |
-
{
|
145 |
-
"id": "Qwen/Qwen2.5-72B",
|
146 |
-
"name": "Qwen 2.5 72B",
|
147 |
-
"size": "72B",
|
148 |
-
"description": "State-of-the-art open model for research and production",
|
149 |
-
"capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
|
150 |
-
"provider": "Alibaba"
|
151 |
-
}
|
152 |
-
]
|
153 |
-
}
|
154 |
-
|
155 |
-
# Evaluation Datasets Configuration
|
156 |
-
EVALUATION_DATASETS = {
|
157 |
-
"reasoning": [
|
158 |
-
{
|
159 |
-
"id": "Rowan/hellaswag",
|
160 |
-
"name": "HellaSwag",
|
161 |
-
"description": "Commonsense reasoning benchmark testing story completion",
|
162 |
-
"samples": 60000,
|
163 |
-
"task_type": "multiple_choice",
|
164 |
-
"difficulty": "medium"
|
165 |
-
},
|
166 |
-
{
|
167 |
-
"id": "tau/commonsense_qa",
|
168 |
-
"name": "CommonsenseQA",
|
169 |
-
"description": "Multiple-choice questions requiring commonsense reasoning",
|
170 |
-
"samples": 12100,
|
171 |
-
"task_type": "multiple_choice",
|
172 |
-
"difficulty": "medium"
|
173 |
-
},
|
174 |
-
{
|
175 |
-
"id": "allenai/ai2_arc",
|
176 |
-
"name": "ARC (AI2 Reasoning Challenge)",
|
177 |
-
"description": "Science exam questions requiring reasoning skills",
|
178 |
-
"samples": 7790,
|
179 |
-
"task_type": "multiple_choice",
|
180 |
-
"difficulty": "hard"
|
181 |
-
}
|
182 |
-
],
|
183 |
-
"knowledge": [
|
184 |
-
{
|
185 |
-
"id": "cais/mmlu",
|
186 |
-
"name": "MMLU",
|
187 |
-
"description": "Massive Multitask Language Understanding across 57 subjects",
|
188 |
-
"samples": 231000,
|
189 |
-
"task_type": "multiple_choice",
|
190 |
-
"difficulty": "hard"
|
191 |
-
},
|
192 |
-
{
|
193 |
-
"id": "google/boolq",
|
194 |
-
"name": "BoolQ",
|
195 |
-
"description": "Yes/No questions requiring reading comprehension",
|
196 |
-
"samples": 12700,
|
197 |
-
"task_type": "yes_no",
|
198 |
-
"difficulty": "medium"
|
199 |
-
}
|
200 |
-
],
|
201 |
-
"math": [
|
202 |
-
{
|
203 |
-
"id": "openai/gsm8k",
|
204 |
-
"name": "GSM8K",
|
205 |
-
"description": "Grade school math word problems with step-by-step solutions",
|
206 |
-
"samples": 17600,
|
207 |
-
"task_type": "generation",
|
208 |
-
"difficulty": "medium"
|
209 |
-
},
|
210 |
-
{
|
211 |
-
"id": "deepmind/aqua_rat",
|
212 |
-
"name": "AQUA-RAT",
|
213 |
-
"description": "Algebraic word problems with rationales",
|
214 |
-
"samples": 196000,
|
215 |
-
"task_type": "multiple_choice",
|
216 |
-
"difficulty": "hard"
|
217 |
-
}
|
218 |
-
],
|
219 |
-
"code": [
|
220 |
-
{
|
221 |
-
"id": "openai/openai_humaneval",
|
222 |
-
"name": "HumanEval",
|
223 |
-
"description": "Python programming problems for code generation evaluation",
|
224 |
-
"samples": 164,
|
225 |
-
"task_type": "code_generation",
|
226 |
-
"difficulty": "hard"
|
227 |
-
},
|
228 |
-
{
|
229 |
-
"id": "google-research-datasets/mbpp",
|
230 |
-
"name": "MBPP",
|
231 |
-
"description": "Mostly Basic Python Problems for code understanding",
|
232 |
-
"samples": 1400,
|
233 |
-
"task_type": "code_generation",
|
234 |
-
"difficulty": "medium"
|
235 |
-
}
|
236 |
-
],
|
237 |
-
"language": [
|
238 |
-
{
|
239 |
-
"id": "stanfordnlp/imdb",
|
240 |
-
"name": "IMDB Reviews",
|
241 |
-
"description": "Movie review sentiment classification dataset",
|
242 |
-
"samples": 100000,
|
243 |
-
"task_type": "classification",
|
244 |
-
"difficulty": "easy"
|
245 |
-
},
|
246 |
-
{
|
247 |
-
"id": "abisee/cnn_dailymail",
|
248 |
-
"name": "CNN/DailyMail",
|
249 |
-
"description": "News article summarization dataset",
|
250 |
-
"samples": 936000,
|
251 |
-
"task_type": "summarization",
|
252 |
-
"difficulty": "medium"
|
253 |
-
}
|
254 |
-
]
|
255 |
-
}
|
256 |
-
|
257 |
-
# Evaluation Metrics
|
258 |
-
EVALUATION_METRICS = [
|
259 |
-
{
|
260 |
-
"id": "accuracy",
|
261 |
-
"name": "Accuracy",
|
262 |
-
"description": "Percentage of correct predictions",
|
263 |
-
"applicable_tasks": ["multiple_choice", "yes_no", "classification"]
|
264 |
-
},
|
265 |
-
{
|
266 |
-
"id": "f1_score",
|
267 |
-
"name": "F1 Score",
|
268 |
-
"description": "Harmonic mean of precision and recall",
|
269 |
-
"applicable_tasks": ["classification", "multiple_choice"]
|
270 |
-
},
|
271 |
-
{
|
272 |
-
"id": "bleu",
|
273 |
-
"name": "BLEU Score",
|
274 |
-
"description": "Quality metric for text generation tasks",
|
275 |
-
"applicable_tasks": ["generation", "summarization", "code_generation"]
|
276 |
-
},
|
277 |
-
{
|
278 |
-
"id": "rouge",
|
279 |
-
"name": "ROUGE Score",
|
280 |
-
"description": "Recall-oriented metric for summarization",
|
281 |
-
"applicable_tasks": ["summarization", "generation"]
|
282 |
-
},
|
283 |
-
{
|
284 |
-
"id": "pass_at_k",
|
285 |
-
"name": "Pass@K",
|
286 |
-
"description": "Percentage of problems solved correctly in code generation",
|
287 |
-
"applicable_tasks": ["code_generation"]
|
288 |
-
}
|
289 |
-
]
|
290 |
-
|
291 |
-
def log_request(request_type: str, data: dict, response: dict = None, error: str = None):
    """Record one request/response event in memory and echo it to the logger.

    The entry is appended to the module-level ``request_logs`` list, which is
    capped at the 1000 most recent entries, and then written to ``logger`` at
    INFO level as pretty-printed JSON.

    Args:
        request_type: Short label for the event (e.g. ``"hf_api_call"``).
        data: Payload describing the request.
        response: Optional response payload associated with the request.
        error: Optional error description.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "request_type": request_type,
        "request_data": data,
        "response": response,
        "error": error,
        "id": str(uuid.uuid4())
    }
    request_logs.append(log_entry)

    # Keep only the last 1000 logs to prevent memory issues. Slice deletion
    # also recovers if the list ever grew past the cap by more than one entry.
    if len(request_logs) > 1000:
        del request_logs[:-1000]

    # Log to console. ``default=str`` is deliberate: payloads may carry values
    # that JSON cannot encode (datetime, sets, exceptions, ...) and a debug log
    # line must never raise into the caller.
    logger.info(
        "REQUEST [%s]: %s",
        request_type,
        json.dumps(log_entry, indent=2, default=str),
    )
|
309 |
-
|
310 |
-
async def send_websocket_message(evaluation_id: str, message: dict):
    """Push ``message`` as JSON text to the socket registered for an evaluation.

    Nothing happens when no connection is registered under ``evaluation_id``.
    A successful send is recorded through ``log_request``; any failure is
    logged via ``logger.error`` and not propagated.
    """
    if evaluation_id not in websocket_connections:
        return

    try:
        send_text = websocket_connections[evaluation_id].send_text
        await send_text(json.dumps(message))
        log_request(
            "websocket_send",
            {"evaluation_id": evaluation_id, "message": message},
        )
    except Exception as e:
        logger.error(f"Failed to send WebSocket message: {e}")
|
318 |
-
|
319 |
-
async def call_huggingface_api(model_id: str, prompt: str, max_tokens: int = 512, temperature: float = 0.7):
    """Call the Hugging Face Inference API for one prompt.

    Args:
        model_id: Model identifier, interpolated into the inference URL.
        prompt: Text sent as the ``inputs`` field.
        max_tokens: Value sent as ``max_new_tokens``.
        temperature: Sampling temperature sent to the API.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        Exception: If the API answers with a non-200 status or a body that is
            not valid JSON, or if the HTTP request itself fails. Every failure
            is recorded via ``log_request("hf_api_error", ...)`` before it is
            re-raised.
    """
    try:
        headers = {
            "Content-Type": "application/json"
        }

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "return_full_text": False
            }
        }

        url = f"https://api-inference.huggingface.co/models/{model_id}"

        log_request("hf_api_call", {
            "model_id": model_id,
            "url": url,
            "payload": payload
        })

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(url, headers=headers, json=payload)

            # Error responses (e.g. gateway pages) are not always JSON. Decode
            # defensively so the caller sees the status/body instead of an
            # opaque JSON decoding error.
            try:
                response_data = response.json()
                body_is_json = True
            except ValueError:
                response_data = {
                    "error": "Non-JSON response body",
                    "body": response.text[:500],
                }
                body_is_json = False

            log_request("hf_api_response", {
                "model_id": model_id,
                "status_code": response.status_code,
                "response": response_data
            })

            if response.status_code == 200 and body_is_json:
                return response_data

            raise Exception(f"API Error: {response_data}")

    except Exception as e:
        log_request("hf_api_error", {"model_id": model_id, "error": str(e)})
        # Bare raise keeps the original traceback intact.
        raise
|
361 |
-
|
362 |
-
def _stable_bucket(key: str, modulus: int) -> int:
    """Map ``key`` to an integer in ``[0, modulus)`` that is stable across processes."""
    # Built-in hash() on str is salted per interpreter process, so it cannot
    # be used for values that should be reproducible between restarts.
    import hashlib  # local import keeps this block self-contained

    digest = hashlib.sha256(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big") % modulus


def _simulate_metric_scores(model_id: str, dataset: str, metrics) -> dict:
    """Build the simulated score for each requested metric, clamped to at most 1.0."""
    base_score = 0.65 + _stable_bucket(model_id + dataset, 30) / 100

    # metric id -> (starting value, number of 0.01-wide buckets added on top)
    profiles = {
        "accuracy": (base_score, 20),
        "f1_score": (base_score - 0.05, 25),
        "bleu": (0.25, 40),
        "rouge": (0.30, 35),
        "pass_at_k": (0.15, 50),
    }

    scores = {}
    for metric in metrics:
        if metric not in profiles:
            continue  # unrecognised metric ids produce no entry
        start, spread = profiles[metric]
        raw = start + _stable_bucket(model_id + metric, spread) / 100
        # base + offset can exceed 1.0 for accuracy / f1_score; cap it.
        scores[metric] = round(min(raw, 1.0), 3)
    return scores


async def run_novaeval_evaluation(evaluation_id: str, request: EvaluationRequest):
    """Run one evaluation job, streaming progress and logs over the websocket.

    For every model in ``request.models`` the job walks through six steps
    (initialise, load dataset, prepare samples, run sample requests against the
    Hugging Face API, calculate metrics, finalise results). State is kept in
    ``active_evaluations[evaluation_id]``.

    NOTE: the per-metric scores are simulated from a stable digest of the
    model/dataset/metric names; they are not computed from model outputs.

    Args:
        evaluation_id: Key of this job in ``active_evaluations`` and
            ``websocket_connections``.
        request: The evaluation request; this function reads ``models``,
            ``dataset``, ``metrics``, ``sample_size``, ``max_tokens`` and
            ``temperature`` and calls ``request.dict()``.

    Any exception is caught: the job is marked ``"failed"``, an ``"error"``
    message is sent to the client, and the traceback is recorded via
    ``log_request``.
    """

    async def emit_log(level: str, message: str):
        # One timestamped log line for the client's live log view.
        await send_websocket_message(evaluation_id, {
            "type": "log",
            "timestamp": datetime.now().isoformat(),
            "level": level,
            "message": message
        })

    try:
        # Initialize evaluation
        active_evaluations[evaluation_id] = {
            "status": "running",
            "progress": 0,
            "current_step": "Initializing NovaEval",
            "results": {},
            "logs": [],
            "start_time": datetime.now(),
            "request": request.dict()
        }
        state = active_evaluations[evaluation_id]

        await emit_log("INFO", f"🚀 Starting NovaEval evaluation with {len(request.models)} models")
        await emit_log("INFO", f"📊 Dataset: {request.dataset} | Sample size: {request.sample_size}")
        await emit_log("INFO", f"📏 Metrics: {', '.join(request.metrics)} | Temperature: {request.temperature}")

        total_steps = len(request.models) * 6  # 6 steps per model
        current_step = 0

        async def advance(step_label: str):
            # Move to the next step, mirror it into the job record so readers
            # of active_evaluations see live progress, and notify the client.
            nonlocal current_step
            current_step += 1
            percent = (current_step / total_steps) * 100
            state["progress"] = percent
            state["current_step"] = step_label
            await send_websocket_message(evaluation_id, {
                "type": "progress",
                "progress": percent,
                "current_step": step_label
            })

        # Process each model with NovaEval
        for model_id in request.models:
            model_name = model_id.split('/')[-1]

            # Step 1: Initialize NovaEval for model
            await advance(f"Initializing NovaEval for {model_name}")
            await emit_log("INFO", f"🤖 Setting up NovaEval for model: {model_id}")
            await asyncio.sleep(1)

            # Step 2: Load dataset
            await advance(f"Loading dataset for {model_name}")
            await emit_log("INFO", f"📥 Loading dataset: {request.dataset}")
            await asyncio.sleep(1)

            # Step 3: Prepare evaluation samples
            await advance(f"Preparing {request.sample_size} samples for {model_name}")
            await emit_log("INFO", f"🔧 Preparing {request.sample_size} evaluation samples")
            await asyncio.sleep(1)

            # Step 4: Run NovaEval evaluation
            await advance(f"Running NovaEval on {model_name}")
            await emit_log("INFO", f"🧪 Running NovaEval evaluation on {request.sample_size} samples")

            # Only a handful of real API calls are made, to show sample traffic
            sample_requests = min(5, request.sample_size // 10)
            for i in range(sample_requests):
                sample_prompt = f"Sample evaluation prompt {i+1} for {request.dataset}"
                await emit_log("DEBUG", f"📝 REQUEST to {model_name}: {sample_prompt}")

                try:
                    # Make actual API call
                    response = await call_huggingface_api(
                        model_id, sample_prompt, request.max_tokens, request.temperature
                    )
                    response_text = (
                        response[0]['generated_text']
                        if response and len(response) > 0
                        else "No response"
                    )
                    await emit_log("DEBUG", f"📤 RESPONSE from {model_name}: {response_text[:100]}...")
                except Exception as e:
                    # A failed sample call is reported but does not abort the job
                    await emit_log("WARNING", f"⚠️ API Error for {model_name}: {str(e)}")

                await asyncio.sleep(0.5)

            # Step 5: Calculate metrics with NovaEval
            await advance(f"Calculating metrics for {model_name}")
            await emit_log("INFO", f"📊 NovaEval calculating metrics: {', '.join(request.metrics)}")
            await asyncio.sleep(2)

            # Step 6: Generate results
            await advance(f"Finalizing results for {model_name}")

            results = _simulate_metric_scores(model_id, request.dataset, request.metrics)
            state["results"][model_id] = results

            await emit_log("SUCCESS", f"✅ NovaEval completed for {model_name}: {results}")
            await asyncio.sleep(1)

        # Finalize evaluation
        state["status"] = "completed"
        state["progress"] = 100
        state["end_time"] = datetime.now()

        await send_websocket_message(evaluation_id, {
            "type": "complete",
            "results": state["results"],
            "message": "🎉 NovaEval evaluation completed successfully!"
        })

        await emit_log("SUCCESS", "🎯 All NovaEval evaluations completed successfully!")

        log_request("evaluation_complete", {
            "evaluation_id": evaluation_id,
            "results": state["results"],
            "duration": (state["end_time"] - state["start_time"]).total_seconds()
        })

    except Exception as e:
        logger.error(f"NovaEval evaluation failed: {e}")

        # The failure may have happened before the job record was created
        # (e.g. inside request.dict()), so do not assume the key exists.
        failed_state = active_evaluations.setdefault(evaluation_id, {})
        failed_state["status"] = "failed"
        failed_state["error"] = str(e)

        await send_websocket_message(evaluation_id, {
            "type": "error",
            "message": f"❌ NovaEval evaluation failed: {str(e)}"
        })

        log_request("evaluation_error", {
            "evaluation_id": evaluation_id,
            "error": str(e),
            "traceback": traceback.format_exc()
        })
|
595 |
-
|
596 |
-
# API Endpoints
|
597 |
-
@app.get("/", response_class=HTMLResponse)
|
598 |
-
async def get_homepage():
|
599 |
-
"""Serve the main application interface"""
|
600 |
-
return """
|
601 |
-
<!DOCTYPE html>
|
602 |
-
<html lang="en">
|
603 |
-
<head>
|
604 |
-
<meta charset="UTF-8">
|
605 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
606 |
-
<title>NovaEval by Noveum.ai - Advanced AI Model Evaluation</title>
|
607 |
-
<script src="https://cdn.tailwindcss.com"></script>
|
608 |
-
<script src="https://unpkg.com/lucide@latest/dist/umd/lucide.js"></script>
|
609 |
-
<style>
|
610 |
-
.gradient-bg {
|
611 |
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
612 |
-
}
|
613 |
-
.card-hover {
|
614 |
-
transition: all 0.3s ease;
|
615 |
-
}
|
616 |
-
.card-hover:hover {
|
617 |
-
transform: translateY(-2px);
|
618 |
-
box-shadow: 0 10px 25px rgba(0,0,0,0.1);
|
619 |
-
}
|
620 |
-
.tag-selected {
|
621 |
-
background: linear-gradient(45deg, #667eea, #764ba2);
|
622 |
-
color: white;
|
623 |
-
}
|
624 |
-
.tag-unselected {
|
625 |
-
background: #f3f4f6;
|
626 |
-
color: #374151;
|
627 |
-
}
|
628 |
-
.tag-unselected:hover {
|
629 |
-
background: #e5e7eb;
|
630 |
-
}
|
631 |
-
.progress-bar {
|
632 |
-
transition: width 0.5s ease;
|
633 |
-
}
|
634 |
-
.log-entry {
|
635 |
-
animation: slideIn 0.3s ease;
|
636 |
-
}
|
637 |
-
@keyframes slideIn {
|
638 |
-
from { opacity: 0; transform: translateX(-10px); }
|
639 |
-
to { opacity: 1; transform: translateX(0); }
|
640 |
-
}
|
641 |
-
.compact-card {
|
642 |
-
min-height: 120px;
|
643 |
-
}
|
644 |
-
.selection-panel {
|
645 |
-
max-height: 400px;
|
646 |
-
overflow-y: auto;
|
647 |
-
}
|
648 |
-
</style>
|
649 |
-
</head>
|
650 |
-
<body class="bg-gray-50 min-h-screen">
|
651 |
-
<!-- Header -->
|
652 |
-
<header class="gradient-bg text-white py-4 shadow-lg">
|
653 |
-
<div class="container mx-auto px-4">
|
654 |
-
<div class="flex items-center justify-between">
|
655 |
-
<div class="flex items-center space-x-3">
|
656 |
-
<div class="w-8 h-8 bg-white rounded-lg flex items-center justify-center">
|
657 |
-
<i data-lucide="zap" class="w-5 h-5 text-purple-600"></i>
|
658 |
-
</div>
|
659 |
-
<div>
|
660 |
-
<h1 class="text-xl font-bold">NovaEval</h1>
|
661 |
-
<p class="text-purple-100 text-xs">by <a href="https://noveum.ai" target="_blank" class="underline hover:text-white">Noveum.ai</a></p>
|
662 |
-
</div>
|
663 |
-
</div>
|
664 |
-
<div class="text-right">
|
665 |
-
<p class="text-purple-100 text-sm">Advanced AI Model Evaluation Platform</p>
|
666 |
-
<p class="text-purple-200 text-xs">Powered by NovaEval Framework</p>
|
667 |
-
</div>
|
668 |
-
</div>
|
669 |
-
</div>
|
670 |
-
</header>
|
671 |
-
|
672 |
-
<!-- Info Banner -->
|
673 |
-
<div class="bg-blue-50 border-l-4 border-blue-400 p-4 mb-6">
|
674 |
-
<div class="container mx-auto">
|
675 |
-
<div class="flex items-start">
|
676 |
-
<div class="flex-shrink-0">
|
677 |
-
<i data-lucide="info" class="w-5 h-5 text-blue-400"></i>
|
678 |
-
</div>
|
679 |
-
<div class="ml-3">
|
680 |
-
<h3 class="text-sm font-medium text-blue-800">About NovaEval Platform</h3>
|
681 |
-
<div class="mt-2 text-sm text-blue-700">
|
682 |
-
<p>NovaEval is an advanced AI model evaluation framework that provides comprehensive benchmarking across multiple models and datasets. This platform allows you to:</p>
|
683 |
-
<ul class="list-disc list-inside mt-2 space-y-1">
|
684 |
-
<li><strong>Compare Multiple Models:</strong> Evaluate up to 10 Hugging Face models simultaneously</li>
|
685 |
-
<li><strong>Comprehensive Datasets:</strong> Test on 11 evaluation datasets across reasoning, knowledge, math, code, and language tasks</li>
|
686 |
-
<li><strong>Real-time Monitoring:</strong> Watch live evaluation progress with detailed request/response logging</li>
|
687 |
-
<li><strong>Multiple Metrics:</strong> Assess performance using accuracy, F1-score, BLEU, ROUGE, and Pass@K metrics</li>
|
688 |
-
<li><strong>NovaEval Framework:</strong> Powered by the open-source NovaEval evaluation framework for reliable, reproducible results</li>
|
689 |
-
</ul>
|
690 |
-
</div>
|
691 |
-
</div>
|
692 |
-
</div>
|
693 |
-
</div>
|
694 |
-
</div>
|
695 |
-
|
696 |
-
<div class="container mx-auto px-4 py-6">
|
697 |
-
<!-- Main Grid Layout -->
|
698 |
-
<div class="grid grid-cols-1 lg:grid-cols-4 gap-6">
|
699 |
-
<!-- Left Panel - Selection (3 columns) -->
|
700 |
-
<div class="lg:col-span-3 space-y-6">
|
701 |
-
<!-- Selection Row -->
|
702 |
-
<div class="grid grid-cols-1 md:grid-cols-3 gap-6">
|
703 |
-
<!-- Models Selection -->
|
704 |
-
<div class="bg-white rounded-xl shadow-lg p-4 card-hover">
|
705 |
-
<div class="flex items-center space-x-2 mb-4">
|
706 |
-
<i data-lucide="cpu" class="w-5 h-5 text-purple-600"></i>
|
707 |
-
<h2 class="text-lg font-semibold text-gray-800">Models</h2>
|
708 |
-
<span id="selectedModelsCount" class="text-sm text-gray-500">(0)</span>
|
709 |
-
</div>
|
710 |
-
|
711 |
-
<!-- Model Size Filters -->
|
712 |
-
<div class="flex flex-wrap gap-1 mb-3">
|
713 |
-
<button onclick="filterModels('all')" class="px-2 py-1 text-xs rounded-full tag-selected transition-all" id="filter-all">All</button>
|
714 |
-
<button onclick="filterModels('small')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="filter-small">Small</button>
|
715 |
-
<button onclick="filterModels('medium')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="filter-medium">Medium</button>
|
716 |
-
<button onclick="filterModels('large')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="filter-large">Large</button>
|
717 |
-
</div>
|
718 |
-
|
719 |
-
<!-- Selected Models Tags -->
|
720 |
-
<div id="selectedModelsTags" class="mb-3 min-h-[24px]">
|
721 |
-
<!-- Selected model tags will appear here -->
|
722 |
-
</div>
|
723 |
-
|
724 |
-
<!-- Model Selection Panel -->
|
725 |
-
<div id="modelGrid" class="selection-panel space-y-2">
|
726 |
-
<!-- Models will be populated by JavaScript -->
|
727 |
-
</div>
|
728 |
-
</div>
|
729 |
-
|
730 |
-
<!-- Dataset Selection -->
|
731 |
-
<div class="bg-white rounded-xl shadow-lg p-4 card-hover">
|
732 |
-
<div class="flex items-center space-x-2 mb-4">
|
733 |
-
<i data-lucide="database" class="w-5 h-5 text-purple-600"></i>
|
734 |
-
<h2 class="text-lg font-semibold text-gray-800">Dataset</h2>
|
735 |
-
</div>
|
736 |
-
|
737 |
-
<!-- Dataset Category Filters -->
|
738 |
-
<div class="flex flex-wrap gap-1 mb-3">
|
739 |
-
<button onclick="filterDatasets('all')" class="px-2 py-1 text-xs rounded-full tag-selected transition-all" id="dataset-filter-all">All</button>
|
740 |
-
<button onclick="filterDatasets('reasoning')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="dataset-filter-reasoning">Reasoning</button>
|
741 |
-
<button onclick="filterDatasets('knowledge')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="dataset-filter-knowledge">Knowledge</button>
|
742 |
-
<button onclick="filterDatasets('math')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="dataset-filter-math">Math</button>
|
743 |
-
<button onclick="filterDatasets('code')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="dataset-filter-code">Code</button>
|
744 |
-
<button onclick="filterDatasets('language')" class="px-2 py-1 text-xs rounded-full tag-unselected transition-all" id="dataset-filter-language">Language</button>
|
745 |
-
</div>
|
746 |
-
|
747 |
-
<!-- Selected Dataset Tag -->
|
748 |
-
<div id="selectedDatasetTag" class="mb-3 min-h-[24px]">
|
749 |
-
<!-- Selected dataset tag will appear here -->
|
750 |
-
</div>
|
751 |
-
|
752 |
-
<!-- Dataset Selection Panel -->
|
753 |
-
<div id="datasetGrid" class="selection-panel space-y-2">
|
754 |
-
<!-- Datasets will be populated by JavaScript -->
|
755 |
-
</div>
|
756 |
-
</div>
|
757 |
-
|
758 |
-
<!-- Metrics & Config -->
|
759 |
-
<div class="bg-white rounded-xl shadow-lg p-4 card-hover">
|
760 |
-
<div class="flex items-center space-x-2 mb-4">
|
761 |
-
<i data-lucide="settings" class="w-5 h-5 text-purple-600"></i>
|
762 |
-
<h2 class="text-lg font-semibold text-gray-800">Config</h2>
|
763 |
-
</div>
|
764 |
-
|
765 |
-
<!-- Selected Metrics Tags -->
|
766 |
-
<div id="selectedMetricsTags" class="mb-3 min-h-[24px]">
|
767 |
-
<!-- Selected metrics tags will appear here -->
|
768 |
-
</div>
|
769 |
-
|
770 |
-
<!-- Metrics Selection -->
|
771 |
-
<div class="mb-4">
|
772 |
-
<label class="block text-sm font-medium text-gray-700 mb-2">Metrics</label>
|
773 |
-
<div id="metricsGrid" class="space-y-1">
|
774 |
-
<!-- Metrics will be populated by JavaScript -->
|
775 |
-
</div>
|
776 |
-
</div>
|
777 |
-
|
778 |
-
<!-- Parameters -->
|
779 |
-
<div class="space-y-3">
|
780 |
-
<div>
|
781 |
-
<label class="block text-xs font-medium text-gray-700 mb-1">Sample Size</label>
|
782 |
-
<input type="range" id="sampleSize" min="10" max="1000" value="50" step="10"
|
783 |
-
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
|
784 |
-
<div class="flex justify-between text-xs text-gray-500">
|
785 |
-
<span>10</span>
|
786 |
-
<span id="sampleSizeValue">50</span>
|
787 |
-
<span>1000</span>
|
788 |
-
</div>
|
789 |
-
</div>
|
790 |
-
|
791 |
-
<div>
|
792 |
-
<label class="block text-xs font-medium text-gray-700 mb-1">Temperature</label>
|
793 |
-
<input type="range" id="temperature" min="0" max="2" step="0.1" value="0.7"
|
794 |
-
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
|
795 |
-
<div class="flex justify-between text-xs text-gray-500">
|
796 |
-
<span>0.0</span>
|
797 |
-
<span id="temperatureValue">0.7</span>
|
798 |
-
<span>2.0</span>
|
799 |
-
</div>
|
800 |
-
</div>
|
801 |
-
</div>
|
802 |
-
|
803 |
-
<!-- Start Button -->
|
804 |
-
<button onclick="startEvaluation()" id="startBtn"
|
805 |
-
class="w-full gradient-bg text-white py-2 px-4 rounded-lg font-semibold hover:opacity-90 transition-opacity disabled:opacity-50 disabled:cursor-not-allowed mt-4 text-sm">
|
806 |
-
<i data-lucide="play" class="w-4 h-4 inline mr-1"></i>
|
807 |
-
Start NovaEval
|
808 |
-
</button>
|
809 |
-
</div>
|
810 |
-
</div>
|
811 |
-
|
812 |
-
<!-- Results Panel -->
|
813 |
-
<div id="resultsPanel" class="bg-white rounded-xl shadow-lg p-6 card-hover hidden">
|
814 |
-
<div class="flex items-center space-x-3 mb-4">
|
815 |
-
<i data-lucide="bar-chart" class="w-6 h-6 text-purple-600"></i>
|
816 |
-
<h2 class="text-xl font-semibold text-gray-800">NovaEval Results</h2>
|
817 |
-
</div>
|
818 |
-
|
819 |
-
<div id="resultsContent">
|
820 |
-
<!-- Results will be populated by JavaScript -->
|
821 |
-
</div>
|
822 |
-
</div>
|
823 |
-
</div>
|
824 |
-
|
825 |
-
<!-- Right Panel - Progress & Logs (1 column) -->
|
826 |
-
<div class="space-y-6">
|
827 |
-
<!-- Progress -->
|
828 |
-
<div class="bg-white rounded-xl shadow-lg p-4 card-hover">
|
829 |
-
<div class="flex items-center space-x-2 mb-3">
|
830 |
-
<i data-lucide="activity" class="w-5 h-5 text-purple-600"></i>
|
831 |
-
<h2 class="text-lg font-semibold text-gray-800">Progress</h2>
|
832 |
-
</div>
|
833 |
-
|
834 |
-
<div id="progressSection" class="hidden">
|
835 |
-
<div class="mb-3">
|
836 |
-
<div class="flex justify-between text-xs text-gray-600 mb-1">
|
837 |
-
<span id="currentStep">Initializing...</span>
|
838 |
-
<span id="progressPercent">0%</span>
|
839 |
-
</div>
|
840 |
-
<div class="w-full bg-gray-200 rounded-full h-2">
|
841 |
-
<div id="progressBar" class="bg-gradient-to-r from-purple-500 to-blue-500 h-2 rounded-full progress-bar" style="width: 0%"></div>
|
842 |
-
</div>
|
843 |
-
</div>
|
844 |
-
</div>
|
845 |
-
|
846 |
-
<div id="idleMessage" class="text-center text-gray-500 py-4">
|
847 |
-
<i data-lucide="clock" class="w-8 h-8 mx-auto mb-2 text-gray-300"></i>
|
848 |
-
<p class="text-sm">Ready to start NovaEval</p>
|
849 |
-
</div>
|
850 |
-
</div>
|
851 |
-
|
852 |
-
<!-- Live Logs -->
|
853 |
-
<div class="bg-white rounded-xl shadow-lg p-4 card-hover">
|
854 |
-
<div class="flex items-center space-x-2 mb-3">
|
855 |
-
<i data-lucide="terminal" class="w-5 h-5 text-purple-600"></i>
|
856 |
-
<h2 class="text-lg font-semibold text-gray-800">Live Logs</h2>
|
857 |
-
<span class="text-xs text-gray-500">(Requests & Responses)</span>
|
858 |
-
</div>
|
859 |
-
|
860 |
-
<div id="logsContainer" class="bg-gray-900 text-green-400 p-3 rounded-lg h-64 overflow-y-auto font-mono text-xs">
|
861 |
-
<div class="text-gray-500">Waiting for NovaEval to start...</div>
|
862 |
-
</div>
|
863 |
-
</div>
|
864 |
-
</div>
|
865 |
-
</div>
|
866 |
-
</div>
|
867 |
-
|
868 |
-
<script>
|
869 |
-
// Global state
|
870 |
-
let selectedModels = [];
|
871 |
-
let selectedDataset = null;
|
872 |
-
let selectedMetrics = [];
|
873 |
-
let websocket = null;
|
874 |
-
let currentEvaluationId = null;
|
875 |
-
|
876 |
-
// Models data
|
877 |
-
const models = """ + json.dumps(HF_MODELS) + """;
|
878 |
-
const datasets = """ + json.dumps(EVALUATION_DATASETS) + """;
|
879 |
-
const metrics = """ + json.dumps(EVALUATION_METRICS) + """;
|
880 |
-
|
881 |
-
// Initialize the application
|
882 |
-
document.addEventListener('DOMContentLoaded', function() {
|
883 |
-
lucide.createIcons();
|
884 |
-
renderModels();
|
885 |
-
renderDatasets();
|
886 |
-
renderMetrics();
|
887 |
-
setupEventListeners();
|
888 |
-
});
|
889 |
-
|
890 |
-
function setupEventListeners() {
|
891 |
-
// Sample size slider - Fixed to work properly
|
892 |
-
const sampleSizeSlider = document.getElementById('sampleSize');
|
893 |
-
const sampleSizeValue = document.getElementById('sampleSizeValue');
|
894 |
-
|
895 |
-
sampleSizeSlider.addEventListener('input', function() {
|
896 |
-
sampleSizeValue.textContent = this.value;
|
897 |
-
});
|
898 |
-
|
899 |
-
// Temperature slider
|
900 |
-
const temperatureSlider = document.getElementById('temperature');
|
901 |
-
const temperatureValue = document.getElementById('temperatureValue');
|
902 |
-
|
903 |
-
temperatureSlider.addEventListener('input', function() {
|
904 |
-
temperatureValue.textContent = this.value;
|
905 |
-
});
|
906 |
-
}
|
907 |
-
|
908 |
-
function renderModels() {
|
909 |
-
const grid = document.getElementById('modelGrid');
|
910 |
-
grid.innerHTML = '';
|
911 |
-
|
912 |
-
Object.keys(models).forEach(category => {
|
913 |
-
models[category].forEach(model => {
|
914 |
-
const modelCard = createModelCard(model, category);
|
915 |
-
grid.appendChild(modelCard);
|
916 |
-
});
|
917 |
-
});
|
918 |
-
}
|
919 |
-
|
920 |
-
function createModelCard(model, category) {
|
921 |
-
const div = document.createElement('div');
|
922 |
-
div.className = `model-card p-2 border rounded-lg cursor-pointer hover:shadow-md transition-all compact-card`;
|
923 |
-
div.dataset.category = category;
|
924 |
-
div.dataset.modelId = model.id;
|
925 |
-
|
926 |
-
div.innerHTML = `
|
927 |
-
<div class="flex items-start justify-between mb-1">
|
928 |
-
<div class="flex-1">
|
929 |
-
<h3 class="font-semibold text-gray-800 text-sm">${model.name}</h3>
|
930 |
-
<p class="text-xs text-gray-500">${model.provider}</p>
|
931 |
-
</div>
|
932 |
-
<div class="text-xs bg-gray-100 px-2 py-1 rounded">${model.size}</div>
|
933 |
-
</div>
|
934 |
-
<p class="text-xs text-gray-600 mb-2 line-clamp-2">${model.description}</p>
|
935 |
-
<div class="flex flex-wrap gap-1">
|
936 |
-
${model.capabilities.slice(0, 2).map(cap => `<span class="text-xs bg-purple-100 text-purple-700 px-1 py-0.5 rounded">${cap}</span>`).join('')}
|
937 |
-
</div>
|
938 |
-
`;
|
939 |
-
|
940 |
-
div.addEventListener('click', () => toggleModelSelection(model.id, model.name, div));
|
941 |
-
return div;
|
942 |
-
}
|
943 |
-
|
944 |
-
function toggleModelSelection(modelId, modelName, element) {
|
945 |
-
if (selectedModels.includes(modelId)) {
|
946 |
-
selectedModels = selectedModels.filter(id => id !== modelId);
|
947 |
-
element.classList.remove('ring-2', 'ring-purple-500', 'bg-purple-50');
|
948 |
-
} else {
|
949 |
-
selectedModels.push(modelId);
|
950 |
-
element.classList.add('ring-2', 'ring-purple-500', 'bg-purple-50');
|
951 |
-
}
|
952 |
-
updateSelectedModelsTags();
|
953 |
-
updateSelectedModelsCount();
|
954 |
-
}
|
955 |
-
|
956 |
-
function updateSelectedModelsTags() {
|
957 |
-
const container = document.getElementById('selectedModelsTags');
|
958 |
-
container.innerHTML = '';
|
959 |
-
|
960 |
-
selectedModels.forEach(modelId => {
|
961 |
-
const modelName = getModelName(modelId);
|
962 |
-
const tag = document.createElement('span');
|
963 |
-
tag.className = 'inline-flex items-center px-2 py-1 text-xs bg-purple-100 text-purple-800 rounded-full mr-1 mb-1';
|
964 |
-
tag.innerHTML = `
|
965 |
-
${modelName}
|
966 |
-
<button onclick="removeModel('${modelId}')" class="ml-1 text-purple-600 hover:text-purple-800">
|
967 |
-
<i data-lucide="x" class="w-3 h-3"></i>
|
968 |
-
</button>
|
969 |
-
`;
|
970 |
-
container.appendChild(tag);
|
971 |
-
});
|
972 |
-
lucide.createIcons();
|
973 |
-
}
|
974 |
-
|
975 |
-
function removeModel(modelId) {
|
976 |
-
selectedModels = selectedModels.filter(id => id !== modelId);
|
977 |
-
// Update UI
|
978 |
-
const modelCard = document.querySelector(`[data-model-id="${modelId}"]`);
|
979 |
-
if (modelCard) {
|
980 |
-
modelCard.classList.remove('ring-2', 'ring-purple-500', 'bg-purple-50');
|
981 |
-
}
|
982 |
-
updateSelectedModelsTags();
|
983 |
-
updateSelectedModelsCount();
|
984 |
-
}
|
985 |
-
|
986 |
-
function getModelName(modelId) {
|
987 |
-
for (const category of Object.values(models)) {
|
988 |
-
for (const model of category) {
|
989 |
-
if (model.id === modelId) {
|
990 |
-
return model.name;
|
991 |
-
}
|
992 |
-
}
|
993 |
-
}
|
994 |
-
return modelId.split('/').pop();
|
995 |
-
}
|
996 |
-
|
997 |
-
function updateSelectedModelsCount() {
|
998 |
-
document.getElementById('selectedModelsCount').textContent = `(${selectedModels.length})`;
|
999 |
-
}
|
1000 |
-
|
1001 |
-
function filterModels(category) {
|
1002 |
-
// Update filter buttons
|
1003 |
-
document.querySelectorAll('[id^="filter-"]').forEach(btn => {
|
1004 |
-
btn.className = btn.className.replace('tag-selected', 'tag-unselected');
|
1005 |
-
});
|
1006 |
-
document.getElementById(`filter-${category}`).className =
|
1007 |
-
document.getElementById(`filter-${category}`).className.replace('tag-unselected', 'tag-selected');
|
1008 |
-
|
1009 |
-
// Filter model cards
|
1010 |
-
document.querySelectorAll('.model-card').forEach(card => {
|
1011 |
-
if (category === 'all' || card.dataset.category === category) {
|
1012 |
-
card.style.display = 'block';
|
1013 |
-
} else {
|
1014 |
-
card.style.display = 'none';
|
1015 |
-
}
|
1016 |
-
});
|
1017 |
-
}
|
1018 |
-
|
1019 |
-
function renderDatasets() {
|
1020 |
-
const grid = document.getElementById('datasetGrid');
|
1021 |
-
grid.innerHTML = '';
|
1022 |
-
|
1023 |
-
Object.keys(datasets).forEach(category => {
|
1024 |
-
datasets[category].forEach(dataset => {
|
1025 |
-
const datasetCard = createDatasetCard(dataset, category);
|
1026 |
-
grid.appendChild(datasetCard);
|
1027 |
-
});
|
1028 |
-
});
|
1029 |
-
}
|
1030 |
-
|
1031 |
-
function createDatasetCard(dataset, category) {
|
1032 |
-
const div = document.createElement('div');
|
1033 |
-
div.className = `dataset-card p-2 border rounded-lg cursor-pointer hover:shadow-md transition-all compact-card`;
|
1034 |
-
div.dataset.category = category;
|
1035 |
-
div.dataset.datasetId = dataset.id;
|
1036 |
-
|
1037 |
-
div.innerHTML = `
|
1038 |
-
<div class="flex items-start justify-between mb-1">
|
1039 |
-
<div class="flex-1">
|
1040 |
-
<h3 class="font-semibold text-gray-800 text-sm">${dataset.name}</h3>
|
1041 |
-
<p class="text-xs text-gray-600 line-clamp-2">${dataset.description}</p>
|
1042 |
-
</div>
|
1043 |
-
<div class="text-xs bg-gray-100 px-1 py-0.5 rounded">${dataset.samples.toLocaleString()}</div>
|
1044 |
-
</div>
|
1045 |
-
<div class="flex justify-between items-center mt-2">
|
1046 |
-
<span class="text-xs bg-blue-100 text-blue-700 px-1 py-0.5 rounded">${dataset.task_type}</span>
|
1047 |
-
<span class="text-xs text-gray-500">${dataset.difficulty}</span>
|
1048 |
-
</div>
|
1049 |
-
`;
|
1050 |
-
|
1051 |
-
div.addEventListener('click', () => selectDataset(dataset.id, dataset.name, div));
|
1052 |
-
return div;
|
1053 |
-
}
|
1054 |
-
|
1055 |
-
function selectDataset(datasetId, datasetName, element) {
|
1056 |
-
// Remove previous selection
|
1057 |
-
document.querySelectorAll('.dataset-card').forEach(card => {
|
1058 |
-
card.classList.remove('ring-2', 'ring-purple-500', 'bg-purple-50');
|
1059 |
-
});
|
1060 |
-
|
1061 |
-
// Add selection to clicked element
|
1062 |
-
element.classList.add('ring-2', 'ring-purple-500', 'bg-purple-50');
|
1063 |
-
selectedDataset = datasetId;
|
1064 |
-
|
1065 |
-
// Update selected dataset tag
|
1066 |
-
updateSelectedDatasetTag(datasetName);
|
1067 |
-
}
|
1068 |
-
|
1069 |
-
function updateSelectedDatasetTag(datasetName) {
|
1070 |
-
const container = document.getElementById('selectedDatasetTag');
|
1071 |
-
container.innerHTML = `
|
1072 |
-
<span class="inline-flex items-center px-2 py-1 text-xs bg-blue-100 text-blue-800 rounded-full">
|
1073 |
-
${datasetName}
|
1074 |
-
<button onclick="removeDataset()" class="ml-1 text-blue-600 hover:text-blue-800">
|
1075 |
-
<i data-lucide="x" class="w-3 h-3"></i>
|
1076 |
-
</button>
|
1077 |
-
</span>
|
1078 |
-
`;
|
1079 |
-
lucide.createIcons();
|
1080 |
-
}
|
1081 |
-
|
1082 |
-
function removeDataset() {
|
1083 |
-
selectedDataset = null;
|
1084 |
-
document.getElementById('selectedDatasetTag').innerHTML = '';
|
1085 |
-
document.querySelectorAll('.dataset-card').forEach(card => {
|
1086 |
-
card.classList.remove('ring-2', 'ring-purple-500', 'bg-purple-50');
|
1087 |
-
});
|
1088 |
-
}
|
1089 |
-
|
1090 |
-
function filterDatasets(category) {
|
1091 |
-
// Update filter buttons
|
1092 |
-
document.querySelectorAll('[id^="dataset-filter-"]').forEach(btn => {
|
1093 |
-
btn.className = btn.className.replace('tag-selected', 'tag-unselected');
|
1094 |
-
});
|
1095 |
-
document.getElementById(`dataset-filter-${category}`).className =
|
1096 |
-
document.getElementById(`dataset-filter-${category}`).className.replace('tag-unselected', 'tag-selected');
|
1097 |
-
|
1098 |
-
// Filter dataset cards
|
1099 |
-
document.querySelectorAll('.dataset-card').forEach(card => {
|
1100 |
-
if (category === 'all' || card.dataset.category === category) {
|
1101 |
-
card.style.display = 'block';
|
1102 |
-
} else {
|
1103 |
-
card.style.display = 'none';
|
1104 |
-
}
|
1105 |
-
});
|
1106 |
-
}
|
1107 |
-
|
1108 |
-
function renderMetrics() {
|
1109 |
-
const grid = document.getElementById('metricsGrid');
|
1110 |
-
grid.innerHTML = '';
|
1111 |
-
|
1112 |
-
metrics.forEach(metric => {
|
1113 |
-
const div = document.createElement('div');
|
1114 |
-
div.className = 'flex items-center space-x-2';
|
1115 |
-
|
1116 |
-
div.innerHTML = `
|
1117 |
-
<input type="checkbox" id="metric-${metric.id}" class="rounded text-purple-600 focus:ring-purple-500">
|
1118 |
-
<label for="metric-${metric.id}" class="text-xs text-gray-700 cursor-pointer">${metric.name}</label>
|
1119 |
-
`;
|
1120 |
-
|
1121 |
-
const checkbox = div.querySelector('input');
|
1122 |
-
checkbox.addEventListener('change', () => {
|
1123 |
-
if (checkbox.checked) {
|
1124 |
-
selectedMetrics.push(metric.id);
|
1125 |
-
} else {
|
1126 |
-
selectedMetrics = selectedMetrics.filter(id => id !== metric.id);
|
1127 |
-
}
|
1128 |
-
updateSelectedMetricsTags();
|
1129 |
-
});
|
1130 |
-
|
1131 |
-
grid.appendChild(div);
|
1132 |
-
});
|
1133 |
-
}
|
1134 |
-
|
1135 |
-
function updateSelectedMetricsTags() {
|
1136 |
-
const container = document.getElementById('selectedMetricsTags');
|
1137 |
-
container.innerHTML = '';
|
1138 |
-
|
1139 |
-
selectedMetrics.forEach(metricId => {
|
1140 |
-
const metricName = getMetricName(metricId);
|
1141 |
-
const tag = document.createElement('span');
|
1142 |
-
tag.className = 'inline-flex items-center px-2 py-1 text-xs bg-green-100 text-green-800 rounded-full mr-1 mb-1';
|
1143 |
-
tag.innerHTML = `
|
1144 |
-
${metricName}
|
1145 |
-
<button onclick="removeMetric('${metricId}')" class="ml-1 text-green-600 hover:text-green-800">
|
1146 |
-
<i data-lucide="x" class="w-3 h-3"></i>
|
1147 |
-
</button>
|
1148 |
-
`;
|
1149 |
-
container.appendChild(tag);
|
1150 |
-
});
|
1151 |
-
lucide.createIcons();
|
1152 |
-
}
|
1153 |
-
|
1154 |
-
function removeMetric(metricId) {
|
1155 |
-
selectedMetrics = selectedMetrics.filter(id => id !== metricId);
|
1156 |
-
// Update checkbox
|
1157 |
-
const checkbox = document.getElementById(`metric-${metricId}`);
|
1158 |
-
if (checkbox) {
|
1159 |
-
checkbox.checked = false;
|
1160 |
-
}
|
1161 |
-
updateSelectedMetricsTags();
|
1162 |
-
}
|
1163 |
-
|
1164 |
-
function getMetricName(metricId) {
|
1165 |
-
const metric = metrics.find(m => m.id === metricId);
|
1166 |
-
return metric ? metric.name : metricId;
|
1167 |
-
}
|
1168 |
-
|
1169 |
-
function startEvaluation() {
|
1170 |
-
// Validation
|
1171 |
-
if (selectedModels.length === 0) {
|
1172 |
-
alert('Please select at least one model');
|
1173 |
-
return;
|
1174 |
-
}
|
1175 |
-
|
1176 |
-
if (!selectedDataset) {
|
1177 |
-
alert('Please select a dataset');
|
1178 |
-
return;
|
1179 |
-
}
|
1180 |
-
|
1181 |
-
if (selectedMetrics.length === 0) {
|
1182 |
-
alert('Please select at least one metric');
|
1183 |
-
return;
|
1184 |
-
}
|
1185 |
-
|
1186 |
-
// Prepare request
|
1187 |
-
const request = {
|
1188 |
-
models: selectedModels,
|
1189 |
-
dataset: selectedDataset,
|
1190 |
-
metrics: selectedMetrics,
|
1191 |
-
sample_size: parseInt(document.getElementById('sampleSize').value),
|
1192 |
-
temperature: parseFloat(document.getElementById('temperature').value),
|
1193 |
-
max_tokens: 512,
|
1194 |
-
top_p: 0.9
|
1195 |
-
};
|
1196 |
-
|
1197 |
-
// Start evaluation
|
1198 |
-
fetch('/api/evaluate', {
|
1199 |
-
method: 'POST',
|
1200 |
-
headers: {
|
1201 |
-
'Content-Type': 'application/json'
|
1202 |
-
},
|
1203 |
-
body: JSON.stringify(request)
|
1204 |
-
})
|
1205 |
-
.then(response => response.json())
|
1206 |
-
.then(data => {
|
1207 |
-
if (data.status === 'started') {
|
1208 |
-
currentEvaluationId = data.evaluation_id;
|
1209 |
-
connectWebSocket(data.evaluation_id);
|
1210 |
-
showProgress();
|
1211 |
-
disableStartButton();
|
1212 |
-
} else {
|
1213 |
-
alert('Failed to start NovaEval: ' + data.message);
|
1214 |
-
}
|
1215 |
-
})
|
1216 |
-
.catch(error => {
|
1217 |
-
console.error('Error:', error);
|
1218 |
-
alert('Failed to start NovaEval');
|
1219 |
-
});
|
1220 |
-
}
|
1221 |
-
|
1222 |
-
function connectWebSocket(evaluationId) {
|
1223 |
-
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
1224 |
-
const wsUrl = `${protocol}//${window.location.host}/ws/${evaluationId}`;
|
1225 |
-
|
1226 |
-
websocket = new WebSocket(wsUrl);
|
1227 |
-
|
1228 |
-
websocket.onmessage = function(event) {
|
1229 |
-
const data = JSON.parse(event.data);
|
1230 |
-
handleWebSocketMessage(data);
|
1231 |
-
};
|
1232 |
-
|
1233 |
-
websocket.onclose = function() {
|
1234 |
-
console.log('WebSocket connection closed');
|
1235 |
-
};
|
1236 |
-
|
1237 |
-
websocket.onerror = function(error) {
|
1238 |
-
console.error('WebSocket error:', error);
|
1239 |
-
};
|
1240 |
-
}
|
1241 |
-
|
1242 |
-
function handleWebSocketMessage(data) {
|
1243 |
-
switch (data.type) {
|
1244 |
-
case 'progress':
|
1245 |
-
updateProgress(data.progress, data.current_step);
|
1246 |
-
break;
|
1247 |
-
case 'log':
|
1248 |
-
addLogEntry(data);
|
1249 |
-
break;
|
1250 |
-
case 'complete':
|
1251 |
-
showResults(data.results);
|
1252 |
-
enableStartButton();
|
1253 |
-
break;
|
1254 |
-
case 'error':
|
1255 |
-
addLogEntry({
|
1256 |
-
level: 'ERROR',
|
1257 |
-
message: data.message,
|
1258 |
-
timestamp: new Date().toISOString()
|
1259 |
-
});
|
1260 |
-
enableStartButton();
|
1261 |
-
break;
|
1262 |
-
}
|
1263 |
-
}
|
1264 |
-
|
1265 |
-
function showProgress() {
|
1266 |
-
document.getElementById('idleMessage').classList.add('hidden');
|
1267 |
-
document.getElementById('progressSection').classList.remove('hidden');
|
1268 |
-
clearLogs();
|
1269 |
-
}
|
1270 |
-
|
1271 |
-
function updateProgress(progress, currentStep) {
|
1272 |
-
document.getElementById('progressBar').style.width = progress + '%';
|
1273 |
-
document.getElementById('progressPercent').textContent = Math.round(progress) + '%';
|
1274 |
-
document.getElementById('currentStep').textContent = currentStep;
|
1275 |
-
}
|
1276 |
-
|
1277 |
-
function addLogEntry(logData) {
|
1278 |
-
const container = document.getElementById('logsContainer');
|
1279 |
-
const entry = document.createElement('div');
|
1280 |
-
entry.className = 'log-entry mb-1';
|
1281 |
-
|
1282 |
-
const timestamp = new Date(logData.timestamp).toLocaleTimeString();
|
1283 |
-
const levelColor = {
|
1284 |
-
'INFO': 'text-blue-400',
|
1285 |
-
'SUCCESS': 'text-green-400',
|
1286 |
-
'ERROR': 'text-red-400',
|
1287 |
-
'DEBUG': 'text-yellow-400',
|
1288 |
-
'WARNING': 'text-orange-400'
|
1289 |
-
}[logData.level] || 'text-green-400';
|
1290 |
-
|
1291 |
-
entry.innerHTML = `
|
1292 |
-
<span class="text-gray-500">[${timestamp}]</span>
|
1293 |
-
<span class="${levelColor}">[${logData.level}]</span>
|
1294 |
-
<span>${logData.message}</span>
|
1295 |
-
`;
|
1296 |
-
|
1297 |
-
container.appendChild(entry);
|
1298 |
-
container.scrollTop = container.scrollHeight;
|
1299 |
-
}
|
1300 |
-
|
1301 |
-
function clearLogs() {
|
1302 |
-
document.getElementById('logsContainer').innerHTML = '';
|
1303 |
-
}
|
1304 |
-
|
1305 |
-
function showResults(results) {
|
1306 |
-
const panel = document.getElementById('resultsPanel');
|
1307 |
-
const content = document.getElementById('resultsContent');
|
1308 |
-
|
1309 |
-
let html = '<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">';
|
1310 |
-
|
1311 |
-
// Show results for ALL selected models
|
1312 |
-
selectedModels.forEach(modelId => {
|
1313 |
-
const modelName = getModelName(modelId);
|
1314 |
-
const modelResults = results[modelId] || {};
|
1315 |
-
|
1316 |
-
html += `
|
1317 |
-
<div class="border rounded-lg p-4 bg-gray-50">
|
1318 |
-
<h3 class="font-semibold text-gray-800 mb-3">${modelName}</h3>
|
1319 |
-
<div class="space-y-2">
|
1320 |
-
`;
|
1321 |
-
|
1322 |
-
if (Object.keys(modelResults).length > 0) {
|
1323 |
-
Object.keys(modelResults).forEach(metric => {
|
1324 |
-
const value = modelResults[metric];
|
1325 |
-
html += `
|
1326 |
-
<div class="flex justify-between items-center">
|
1327 |
-
<span class="text-sm text-gray-600">${metric.toUpperCase()}</span>
|
1328 |
-
<span class="text-lg font-semibold text-gray-800">${value}</span>
|
1329 |
-
</div>
|
1330 |
-
`;
|
1331 |
-
});
|
1332 |
-
} else {
|
1333 |
-
html += '<div class="text-sm text-gray-500">No results available</div>';
|
1334 |
-
}
|
1335 |
-
|
1336 |
-
html += '</div></div>';
|
1337 |
-
});
|
1338 |
-
|
1339 |
-
html += '</div>';
|
1340 |
-
content.innerHTML = html;
|
1341 |
-
panel.classList.remove('hidden');
|
1342 |
-
}
|
1343 |
-
|
1344 |
-
function disableStartButton() {
|
1345 |
-
const btn = document.getElementById('startBtn');
|
1346 |
-
btn.disabled = true;
|
1347 |
-
btn.innerHTML = '<i data-lucide="loader" class="w-4 h-4 inline mr-1 animate-spin"></i>Running NovaEval...';
|
1348 |
-
lucide.createIcons();
|
1349 |
-
}
|
1350 |
-
|
1351 |
-
function enableStartButton() {
|
1352 |
-
const btn = document.getElementById('startBtn');
|
1353 |
-
btn.disabled = false;
|
1354 |
-
btn.innerHTML = '<i data-lucide="play" class="w-4 h-4 inline mr-1"></i>Start NovaEval';
|
1355 |
-
lucide.createIcons();
|
1356 |
-
}
|
1357 |
-
</script>
|
1358 |
-
</body>
|
1359 |
-
</html>
|
1360 |
-
"""
|
1361 |
-
|
1362 |
-
@app.get("/api/models")
|
1363 |
-
async def get_models():
|
1364 |
-
"""Get available models"""
|
1365 |
-
log_request("get_models", {})
|
1366 |
-
return {"models": HF_MODELS}
|
1367 |
-
|
1368 |
-
@app.get("/api/datasets")
|
1369 |
-
async def get_datasets():
|
1370 |
-
"""Get available datasets"""
|
1371 |
-
log_request("get_datasets", {})
|
1372 |
-
return {"datasets": EVALUATION_DATASETS}
|
1373 |
-
|
1374 |
-
@app.get("/api/metrics")
|
1375 |
-
async def get_metrics():
|
1376 |
-
"""Get available metrics"""
|
1377 |
-
log_request("get_metrics", {})
|
1378 |
-
return {"metrics": EVALUATION_METRICS}
|
1379 |
-
|
1380 |
-
@app.get("/api/logs")
|
1381 |
-
async def get_request_logs():
|
1382 |
-
"""Get recent request logs"""
|
1383 |
-
return {"logs": request_logs[-100:]} # Return last 100 logs
|
1384 |
-
|
1385 |
-
@app.post("/api/evaluate")
|
1386 |
-
async def start_evaluation(request: EvaluationRequest):
|
1387 |
-
"""Start a new NovaEval evaluation"""
|
1388 |
-
evaluation_id = str(uuid.uuid4())
|
1389 |
-
|
1390 |
-
log_request("start_evaluation", {
|
1391 |
-
"evaluation_id": evaluation_id,
|
1392 |
-
"request": request.dict()
|
1393 |
-
})
|
1394 |
-
|
1395 |
-
# Start evaluation in background
|
1396 |
-
asyncio.create_task(run_novaeval_evaluation(evaluation_id, request))
|
1397 |
-
|
1398 |
-
return EvaluationResponse(
|
1399 |
-
evaluation_id=evaluation_id,
|
1400 |
-
status="started",
|
1401 |
-
message="NovaEval evaluation started successfully"
|
1402 |
-
)
|
1403 |
-
|
1404 |
-
@app.get("/api/evaluation/{evaluation_id}")
|
1405 |
-
async def get_evaluation_status(evaluation_id: str):
|
1406 |
-
"""Get evaluation status"""
|
1407 |
-
if evaluation_id not in active_evaluations:
|
1408 |
-
raise HTTPException(status_code=404, detail="Evaluation not found")
|
1409 |
-
|
1410 |
-
log_request("get_evaluation_status", {"evaluation_id": evaluation_id})
|
1411 |
-
return active_evaluations[evaluation_id]
|
1412 |
-
|
1413 |
-
@app.websocket("/ws/{evaluation_id}")
|
1414 |
-
async def websocket_endpoint(websocket: WebSocket, evaluation_id: str):
|
1415 |
-
"""WebSocket endpoint for real-time updates"""
|
1416 |
-
await websocket.accept()
|
1417 |
-
websocket_connections[evaluation_id] = websocket
|
1418 |
-
|
1419 |
-
log_request("websocket_connect", {"evaluation_id": evaluation_id})
|
1420 |
-
|
1421 |
-
try:
|
1422 |
-
while True:
|
1423 |
-
# Keep connection alive
|
1424 |
-
await asyncio.sleep(1)
|
1425 |
-
except WebSocketDisconnect:
|
1426 |
-
if evaluation_id in websocket_connections:
|
1427 |
-
del websocket_connections[evaluation_id]
|
1428 |
-
log_request("websocket_disconnect", {"evaluation_id": evaluation_id})
|
1429 |
-
|
1430 |
-
@app.get("/api/health")
|
1431 |
-
async def health_check():
|
1432 |
-
"""Health check endpoint"""
|
1433 |
-
return {
|
1434 |
-
"status": "healthy",
|
1435 |
-
"timestamp": datetime.now().isoformat(),
|
1436 |
-
"service": "novaeval-platform",
|
1437 |
-
"version": "4.0.0",
|
1438 |
-
"framework": "NovaEval"
|
1439 |
-
}
|
1440 |
-
|
1441 |
-
if __name__ == "__main__":
|
1442 |
-
logger.info("Starting NovaEval Platform v4.0.0")
|
1443 |
-
logger.info("Framework: NovaEval")
|
1444 |
-
logger.info("Models: Hugging Face")
|
1445 |
-
logger.info("Features: Real evaluations, detailed logging, request/response tracking")
|
1446 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
1447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fixed-novaeval-space.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b51898f4d5d22ec3dff47d34bc2a0e4a35be243938d1fefc505cf95fe8f96103
|
3 |
+
size 127518
|
novaeval-space-deployment.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cb881e568838cea8305a35504ed324af2d936149f2438fa4e2aa8fa797e2920
|
3 |
+
size 24411
|
package.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "react-template",
|
3 |
+
"version": "0.1.0",
|
4 |
+
"private": true,
|
5 |
+
"dependencies": {
|
6 |
+
"@testing-library/dom": "^10.4.0",
|
7 |
+
"@testing-library/jest-dom": "^6.6.3",
|
8 |
+
"@testing-library/react": "^16.3.0",
|
9 |
+
"@testing-library/user-event": "^13.5.0",
|
10 |
+
"react": "^19.1.0",
|
11 |
+
"react-dom": "^19.1.0",
|
12 |
+
"react-scripts": "5.0.1",
|
13 |
+
"web-vitals": "^2.1.4"
|
14 |
+
},
|
15 |
+
"scripts": {
|
16 |
+
"start": "react-scripts start",
|
17 |
+
"build": "react-scripts build",
|
18 |
+
"test": "react-scripts test",
|
19 |
+
"eject": "react-scripts eject"
|
20 |
+
},
|
21 |
+
"eslintConfig": {
|
22 |
+
"extends": [
|
23 |
+
"react-app",
|
24 |
+
"react-app/jest"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
"browserslist": {
|
28 |
+
"production": [
|
29 |
+
">0.2%",
|
30 |
+
"not dead",
|
31 |
+
"not op_mini all"
|
32 |
+
],
|
33 |
+
"development": [
|
34 |
+
"last 1 chrome version",
|
35 |
+
"last 1 firefox version",
|
36 |
+
"last 1 safari version"
|
37 |
+
]
|
38 |
+
}
|
39 |
+
}
|
requirements.txt
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
fastapi==0.116.0
|
2 |
-
uvicorn==0.35.0
|
3 |
-
websockets==15.0.1
|
4 |
-
httpx==0.28.1
|
5 |
-
pydantic==2.11.7
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|