Upload 4 files

- Dockerfile +13 -0
- README.md +20 -455
- app.py +568 -0
- requirements.txt +3 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@

```dockerfile
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

EXPOSE 7860

CMD ["python", "app.py"]
```
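A quick local smoke test of this Dockerfile might look like the following sketch (the `novaeval-space` image tag is an arbitrary choice, not part of the commit):

```bash
# Build the Space image from the repository root
docker build -t novaeval-space .

# Run it, publishing the port the Dockerfile exposes
docker run --rm -p 7860:7860 novaeval-space
```

The UI should then be served at http://localhost:7860, matching the `app_port: 7860` declared in the new README below.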
README.md
CHANGED
@@ -1,466 +1,31 @@

Removed — the previous repository README:

---
title: NovaEval
emoji:
colorFrom:
colorTo:
sdk:
pinned: false
license: apache-2.0
short_description: A comprehensive AI model evaluation framework.
---

# NovaEval by Noveum.ai

[CI/Release](https://github.com/Noveum/NovaEval/actions/workflows/release.yml)
[Coverage](https://codecov.io/gh/Noveum/NovaEval)
[PyPI](https://badge.fury.io/py/novaeval)
[Python](https://www.python.org/downloads/)
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)

A comprehensive AI model evaluation framework.

> **We're looking for contributors!** See the [Contributing](#-contributing) section below for ways to help.

## 🆘 We Need Your Help

We're actively looking for contributors in these key areas:

- **🧪 Unit Tests**: Help us improve our test coverage (currently 23% overall, 90%+ for core modules)
- **📚 Examples**: Create real-world evaluation examples and use cases
- **📝 Guides & Notebooks**: Write evaluation guides and interactive Jupyter notebooks
- **📖 Documentation**: Improve API documentation and user guides
- **🔍 RAG Metrics**: Add more metrics specifically for Retrieval-Augmented Generation evaluation
- **🤖 Agent Evaluation**: Build frameworks for evaluating AI agents and multi-turn conversations

### 🚀 Getting Started as a Contributor

1. **Start Small**: Pick up issues labeled `good first issue` or `help wanted`
2. **Join Discussions**: Share your ideas in [GitHub Discussions](https://github.com/Noveum/NovaEval/discussions)
3. **Review Code**: Help review pull requests and provide feedback
4. **Report Issues**: Found a bug? Report it in [GitHub Issues](https://github.com/Noveum/NovaEval/issues)
5. **Spread the Word**: Star the repository and share with your network

## 🚀 Features

- **Multi-Model Support**: Evaluate models from OpenAI, Anthropic, AWS Bedrock, and custom providers
- **Extensible Scoring**: Built-in scorers for accuracy, semantic similarity, code evaluation, and custom metrics
- **Dataset Integration**: Support for MMLU, HuggingFace datasets, custom datasets, and more
- **Production Ready**: Docker support, Kubernetes deployment, and cloud integrations
- **Comprehensive Reporting**: Detailed evaluation reports, artifacts, and visualizations
- **Secure**: Built-in credential management and secret store integration
- **Scalable**: Designed for both local testing and large-scale production evaluations
- **Cross-Platform**: Tested on macOS, Linux, and Windows with comprehensive CI/CD

## 📦 Installation

### From PyPI (Recommended)

```bash
pip install novaeval
```

### From Source

```bash
git clone https://github.com/Noveum/NovaEval.git
cd NovaEval
pip install -e .
```

### Docker

```bash
docker pull noveum/novaeval:latest
```

## 🏃‍♂️ Quick Start

### Basic Evaluation

```python
from novaeval import Evaluator
from novaeval.datasets import MMLUDataset
from novaeval.models import OpenAIModel
from novaeval.scorers import AccuracyScorer

# Configure for cost-conscious evaluation
MAX_TOKENS = 100  # Adjust based on budget: 5-10 for answers, 100+ for reasoning

# Initialize components
dataset = MMLUDataset(
    subset="elementary_mathematics",  # Easier subset for demo
    num_samples=10,
    split="test"
)

model = OpenAIModel(
    model_name="gpt-4o-mini",  # Cost-effective model
    temperature=0.0,
    max_tokens=MAX_TOKENS
)

scorer = AccuracyScorer(extract_answer=True)

# Create and run evaluation
evaluator = Evaluator(
    dataset=dataset,
    models=[model],
    scorers=[scorer],
    output_dir="./results"
)

results = evaluator.run()

# Display detailed results
for model_name, model_results in results["model_results"].items():
    for scorer_name, score_info in model_results["scores"].items():
        if isinstance(score_info, dict):
            mean_score = score_info.get("mean", 0)
            count = score_info.get("count", 0)
            print(f"{scorer_name}: {mean_score:.4f} ({count} samples)")
```

### Configuration-Based Evaluation

```python
from novaeval import Evaluator

# Load configuration from YAML/JSON
evaluator = Evaluator.from_config("evaluation_config.yaml")
results = evaluator.run()
```

### Command Line Interface

NovaEval provides a comprehensive CLI for running evaluations:

```bash
# Run evaluation from configuration file
novaeval run config.yaml

# Quick evaluation with minimal setup
novaeval quick -d mmlu -m gpt-4 -s accuracy

# List available datasets, models, and scorers
novaeval list-datasets
novaeval list-models
novaeval list-scorers

# Generate sample configuration
novaeval generate-config sample-config.yaml
```

📖 **[Complete CLI Reference](docs/cli-reference.md)** - Detailed documentation for all CLI commands and options

### Example Configuration

```yaml
# evaluation_config.yaml
dataset:
  type: "mmlu"
  subset: "abstract_algebra"
  num_samples: 500

models:
  - type: "openai"
    model_name: "gpt-4"
    temperature: 0.0
  - type: "anthropic"
    model_name: "claude-3-opus"
    temperature: 0.0

scorers:
  - type: "accuracy"
  - type: "semantic_similarity"
    threshold: 0.8

output:
  directory: "./results"
  formats: ["json", "csv", "html"]
  upload_to_s3: true
  s3_bucket: "my-eval-results"
```

## 🏗️ Architecture

NovaEval is built with extensibility and modularity in mind:

```
src/novaeval/
├── datasets/      # Dataset loaders and processors
├── evaluators/    # Core evaluation logic
├── integrations/  # External service integrations
├── models/        # Model interfaces and adapters
├── reporting/     # Report generation and visualization
├── scorers/       # Scoring mechanisms and metrics
└── utils/         # Utility functions and helpers
```

### Core Components

- **Datasets**: Standardized interface for loading evaluation datasets
- **Models**: Unified API for different AI model providers
- **Scorers**: Pluggable scoring mechanisms for various evaluation metrics
- **Evaluators**: Orchestrates the evaluation process
- **Reporting**: Generates comprehensive reports and artifacts
- **Integrations**: Handles external services (S3, credential stores, etc.)

## 📊 Supported Datasets

- **MMLU**: Massive Multitask Language Understanding
- **HuggingFace**: Any dataset from the HuggingFace Hub
- **Custom**: JSON, CSV, or programmatic dataset definitions
- **Code Evaluation**: Programming benchmarks and code generation tasks
- **Agent Traces**: Multi-turn conversation and agent evaluation

## 🤖 Supported Models

- **OpenAI**: GPT-3.5, GPT-4, and newer models
- **Anthropic**: Claude family models
- **AWS Bedrock**: Amazon's managed AI services
- **Noveum AI Gateway**: Integration with Noveum's model gateway
- **Custom**: Extensible interface for any API-based model

## 📏 Built-in Scorers

### Accuracy-Based
- **ExactMatch**: Exact string matching
- **Accuracy**: Classification accuracy
- **F1Score**: F1 score for classification tasks

### Semantic-Based
- **SemanticSimilarity**: Embedding-based similarity scoring
- **BERTScore**: BERT-based semantic evaluation
- **RougeScore**: ROUGE metrics for text generation

### Code-Specific
- **CodeExecution**: Execute and validate code outputs
- **SyntaxChecker**: Validate code syntax
- **TestCoverage**: Code coverage analysis

### Custom
- **LLMJudge**: Use another LLM as a judge
- **HumanEval**: Integration with human evaluation workflows

## 🚀 Deployment

### Local Development

```bash
# Install dependencies
pip install -e ".[dev]"

# Run tests
pytest

# Run example evaluation
python examples/basic_evaluation.py
```

### Docker

```bash
# Build image
docker build -t nova-eval .

# Run evaluation
docker run -v $(pwd)/config:/config -v $(pwd)/results:/results nova-eval --config /config/eval.yaml
```

### Kubernetes

```bash
# Deploy to Kubernetes
kubectl apply -f kubernetes/

# Check status
kubectl get pods -l app=nova-eval
```

## 🔧 Configuration

NovaEval supports configuration through:

- **YAML/JSON files**: Declarative configuration
- **Environment variables**: Runtime configuration
- **Python code**: Programmatic configuration
- **CLI arguments**: Command-line overrides

### Environment Variables

```bash
export NOVA_EVAL_OUTPUT_DIR="./results"
export NOVA_EVAL_LOG_LEVEL="INFO"
export OPENAI_API_KEY="your-api-key"
export AWS_ACCESS_KEY_ID="your-aws-key"
```

### CI/CD Integration

NovaEval includes optimized GitHub Actions workflows:
- **Unit tests** run on all PRs and pushes for quick feedback
- **Integration tests** run on main branch only to minimize API costs
- **Cross-platform testing** on macOS, Linux, and Windows

## 📈 Reporting and Artifacts

NovaEval generates comprehensive evaluation reports:

- **Summary Reports**: High-level metrics and insights
- **Detailed Results**: Per-sample predictions and scores
- **Visualizations**: Charts and graphs for result analysis
- **Artifacts**: Model outputs, intermediate results, and debug information
- **Export Formats**: JSON, CSV, HTML, PDF

### Example Report Structure

```
results/
├── summary.json              # High-level metrics
├── detailed_results.csv      # Per-sample results
├── artifacts/
│   ├── model_outputs/        # Raw model responses
│   ├── intermediate/         # Processing artifacts
│   └── debug/                # Debug information
├── visualizations/
│   ├── accuracy_by_category.png
│   ├── score_distribution.png
│   └── confusion_matrix.png
└── report.html               # Interactive HTML report
```

## 🔌 Extending NovaEval

### Custom Datasets

```python
from novaeval.datasets import BaseDataset

class MyCustomDataset(BaseDataset):
    def load_data(self):
        # Implement data loading logic
        return samples

    def get_sample(self, index):
        # Return individual sample
        return sample
```

### Custom Scorers

```python
from novaeval.scorers import BaseScorer

class MyCustomScorer(BaseScorer):
    def score(self, prediction, ground_truth, context=None):
        # Implement scoring logic
        return score
```

### Custom Models

```python
from novaeval.models import BaseModel

class MyCustomModel(BaseModel):
    def generate(self, prompt, **kwargs):
        # Implement model inference
        return response
```

## 🤝 Contributing

We welcome contributions! NovaEval is actively seeking contributors to help build a robust AI evaluation framework. Please see our [Contributing Guide](CONTRIBUTING.md) for detailed guidelines.

### 🎯 Priority Contribution Areas

As mentioned in the [We Need Your Help](#-we-need-your-help) section, we're particularly looking for help with:

1. **Unit Tests** - Expand test coverage beyond the current 23%
2. **Examples** - Real-world evaluation scenarios and use cases
3. **Guides & Notebooks** - Interactive evaluation tutorials
4. **Documentation** - API docs, user guides, and tutorials
5. **RAG Metrics** - Specialized metrics for retrieval-augmented generation
6. **Agent Evaluation** - Frameworks for multi-turn and agent-based evaluations

### Development Setup

```bash
# Clone repository
git clone https://github.com/Noveum/NovaEval.git
cd NovaEval

# Create virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install development dependencies
pip install -e ".[dev]"

# Install pre-commit hooks
pre-commit install

# Run tests
pytest

# Run with coverage
pytest --cov=src/novaeval --cov-report=html
```

### 🏗️ Contribution Workflow

1. **Fork** the repository
2. **Create** a feature branch (`git checkout -b feature/amazing-feature`)
3. **Make** your changes following our coding standards
4. **Add** tests for your changes
5. **Commit** your changes (`git commit -m 'Add amazing feature'`)
6. **Push** to the branch (`git push origin feature/amazing-feature`)
7. **Open** a Pull Request

### 📋 Contribution Guidelines

- **Code Quality**: Follow PEP 8 and use the provided pre-commit hooks
- **Testing**: Add unit tests for new features and bug fixes
- **Documentation**: Update documentation for API changes
- **Commit Messages**: Use conventional commit format
- **Issues**: Reference relevant issues in your PR description

### 🎉 Recognition

Contributors will be:
- Listed in our contributors page
- Mentioned in release notes for significant contributions
- Invited to join our contributor Discord community

## 📄 License

This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.

## 🙏 Acknowledgments

- Inspired by evaluation frameworks like DeepEval, Confident AI, and Braintrust
- Built with modern Python best practices and industry standards
- Designed for the AI evaluation community

## 📞 Support

- **Documentation**: [https://noveum.github.io/NovaEval](https://noveum.github.io/NovaEval)
- **Issues**: [GitHub Issues](https://github.com/Noveum/NovaEval/issues)
- **Discussions**: [GitHub Discussions](https://github.com/Noveum/NovaEval/discussions)
- **Email**: [email protected]

---

Made with ❤️ by the Noveum.ai team

Added — the new Space README:

---
title: NovaEval - AI Model Evaluation Platform
emoji: 🧪
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
license: mit
app_port: 7860
---

# NovaEval - AI Model Evaluation Platform

A comprehensive evaluation platform for AI models, powered by the NovaEval framework.

## Features

- 🤗 Hugging Face model integration
- 📊 Multiple evaluation metrics
- ⚡ Real-time progress tracking
- 📱 Mobile-friendly interface

## Quick Start

1. Select models from Hugging Face
2. Choose evaluation dataset
3. Pick metrics to compute
4. Run evaluation and view results

Powered by [NovaEval](https://github.com/Noveum/NovaEval) and [Hugging Face](https://huggingface.co).
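The "Hugging Face model integration" feature listed above presumably goes through the hosted Inference API. As a hedged sketch only (the endpoint usage and the `$HF_TOKEN` placeholder are assumptions, not code from this commit), a raw call against one of the demo's listed models would look roughly like:

```bash
# Sketch: query the hosted Inference API for google/flan-t5-base,
# one of the models offered in the demo UI in app.py below.
# The Authorization header is optional at low request rates.
curl -s https://api-inference.huggingface.co/models/google/flan-t5-base \
  -H "Authorization: Bearer $HF_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"inputs": "Question: What is 2 + 2? Answer:"}'
```

Note that the app shipped in this commit does not make this call yet; its demo results are simulated client-side (see the mock-results logic in app.py).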
app.py
ADDED
@@ -0,0 +1,568 @@

```python
"""
NovaEval Space - Minimal Guaranteed-to-Work Version
Single file approach with embedded HTML/CSS/JS
"""

import os
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(title="NovaEval - AI Model Evaluation Platform")

# Embedded HTML with CSS and JavaScript
HTML_CONTENT = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NovaEval - AI Model Evaluation Platform</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            color: #333;
        }

        .container {
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
        }

        .header {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            padding: 30px;
            margin-bottom: 30px;
            text-align: center;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
        }

        .header h1 {
            font-size: 3rem;
            background: linear-gradient(135deg, #667eea, #764ba2);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            margin-bottom: 10px;
        }

        .header p {
            font-size: 1.2rem;
            color: #666;
            margin-bottom: 20px;
        }

        .status {
            display: inline-flex;
            align-items: center;
            background: #10b981;
            color: white;
            padding: 8px 16px;
            border-radius: 20px;
            font-size: 0.9rem;
            font-weight: 500;
        }

        .status::before {
            content: "⚡";
            margin-right: 8px;
        }

        .main-content {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
            gap: 30px;
            margin-bottom: 30px;
        }

        .card {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }

        .card:hover {
            transform: translateY(-5px);
            box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15);
        }

        .card h3 {
            font-size: 1.5rem;
            margin-bottom: 15px;
            color: #333;
        }

        .card p {
            color: #666;
            line-height: 1.6;
            margin-bottom: 20px;
        }

        .feature-list {
            list-style: none;
        }

        .feature-list li {
            padding: 8px 0;
            color: #555;
        }

        .feature-list li::before {
            content: "✓";
            color: #10b981;
            font-weight: bold;
            margin-right: 10px;
        }

        .demo-section {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
            margin-bottom: 30px;
        }

        .demo-controls {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }

        .control-group {
            background: #f8fafc;
            padding: 20px;
            border-radius: 12px;
            border: 2px solid #e2e8f0;
        }

        .control-group h4 {
            margin-bottom: 15px;
            color: #334155;
        }

        .model-option, .dataset-option, .metric-option {
            display: block;
            width: 100%;
            padding: 12px;
            margin: 8px 0;
            background: white;
            border: 2px solid #e2e8f0;
            border-radius: 8px;
            cursor: pointer;
            transition: all 0.2s ease;
        }

        .model-option:hover, .dataset-option:hover, .metric-option:hover {
            border-color: #667eea;
            background: #f0f4ff;
        }

        .model-option.selected, .dataset-option.selected, .metric-option.selected {
            border-color: #667eea;
            background: #667eea;
            color: white;
        }

        .start-btn {
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            border: none;
            padding: 15px 30px;
            border-radius: 12px;
            font-size: 1.1rem;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
            width: 100%;
            margin-top: 20px;
        }

        .start-btn:hover {
            transform: translateY(-2px);
            box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4);
        }

        .start-btn:disabled {
            opacity: 0.6;
            cursor: not-allowed;
            transform: none;
        }

        .progress-section {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
            margin-top: 20px;
            display: none;
        }

        .progress-bar {
            width: 100%;
            height: 20px;
            background: #e2e8f0;
            border-radius: 10px;
            overflow: hidden;
            margin: 15px 0;
        }

        .progress-fill {
            height: 100%;
            background: linear-gradient(90deg, #10b981, #059669);
            width: 0%;
            transition: width 0.5s ease;
        }

        .results-section {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            border-radius: 20px;
            padding: 30px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
            margin-top: 20px;
            display: none;
        }

        .result-card {
            background: #f8fafc;
            border: 2px solid #e2e8f0;
            border-radius: 12px;
            padding: 20px;
            margin: 15px 0;
        }

        .result-score {
            font-size: 2rem;
            font-weight: bold;
            color: #10b981;
        }

        .footer {
            text-align: center;
            color: rgba(255, 255, 255, 0.8);
            margin-top: 40px;
        }

        .footer a {
            color: rgba(255, 255, 255, 0.9);
            text-decoration: none;
        }

        .footer a:hover {
            text-decoration: underline;
        }

        @media (max-width: 768px) {
            .header h1 {
                font-size: 2rem;
            }

            .demo-controls {
                grid-template-columns: 1fr;
            }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🧪 NovaEval</h1>
            <p>AI Model Evaluation Platform</p>
            <div class="status">Powered by Hugging Face</div>
        </div>

        <div class="main-content">
            <div class="card">
                <h3>🤗 Hugging Face Models</h3>
                <p>Evaluate thousands of open-source models directly through the Hugging Face Inference API.</p>
                <ul class="feature-list">
                    <li>No API keys required</li>
                    <li>Llama, Mistral, CodeLlama</li>
                    <li>FLAN-T5, Phi, Gemma</li>
                    <li>Cost-free evaluation</li>
                </ul>
            </div>

            <div class="card">
                <h3>📊 Comprehensive Evaluation</h3>
                <p>Test models across popular datasets with multiple evaluation metrics.</p>
                <ul class="feature-list">
                    <li>MMLU, HumanEval, HellaSwag</li>
                    <li>Accuracy, F1-Score, BLEU</li>
                    <li>Custom datasets supported</li>
                    <li>Real-time progress tracking</li>
                </ul>
            </div>

            <div class="card">
                <h3>⚡ Easy to Use</h3>
                <p>Intuitive interface for researchers, developers, and AI enthusiasts.</p>
                <ul class="feature-list">
                    <li>Step-by-step wizard</li>
                    <li>Interactive visualizations</li>
                    <li>Export results (JSON, CSV)</li>
                    <li>Mobile-friendly design</li>
                </ul>
            </div>
        </div>

        <div class="demo-section">
            <h3>🚀 Try the Evaluation Demo</h3>
            <p>Select models, datasets, and metrics to run a sample evaluation:</p>

            <div class="demo-controls">
                <div class="control-group">
                    <h4>Select Models (max 2)</h4>
                    <button class="model-option" data-model="microsoft/DialoGPT-medium">
                        DialoGPT Medium<br>
                        <small>Conversational AI by Microsoft</small>
                    </button>
                    <button class="model-option" data-model="google/flan-t5-base">
                        FLAN-T5 Base<br>
                        <small>Instruction-tuned by Google</small>
                    </button>
                    <button class="model-option" data-model="mistralai/Mistral-7B-Instruct-v0.1">
                        Mistral 7B Instruct<br>
                        <small>High-performance model</small>
                    </button>
                </div>

                <div class="control-group">
                    <h4>Select Dataset</h4>
                    <button class="dataset-option" data-dataset="mmlu">
                        MMLU<br>
                        <small>Multitask Language Understanding</small>
                    </button>
                    <button class="dataset-option" data-dataset="hellaswag">
                        HellaSwag<br>
                        <small>Commonsense Reasoning</small>
                    </button>
                    <button class="dataset-option" data-dataset="humaneval">
                        HumanEval<br>
                        <small>Code Generation</small>
                    </button>
                </div>

                <div class="control-group">
                    <h4>Select Metrics</h4>
                    <button class="metric-option" data-metric="accuracy">
                        Accuracy<br>
                        <small>Classification accuracy</small>
                    </button>
                    <button class="metric-option" data-metric="f1">
                        F1 Score<br>
                        <small>Balanced precision/recall</small>
                    </button>
                    <button class="metric-option" data-metric="bleu">
                        BLEU Score<br>
                        <small>Text generation quality</small>
                    </button>
                </div>
            </div>

            <button class="start-btn" id="startEvaluation" disabled>
                Start Evaluation Demo
            </button>
        </div>

        <div class="progress-section" id="progressSection">
            <h3>🔄 Evaluation in Progress</h3>
            <p id="progressText">Initializing evaluation...</p>
            <div class="progress-bar">
                <div class="progress-fill" id="progressFill"></div>
            </div>
            <p id="progressPercent">0%</p>
        </div>

        <div class="results-section" id="resultsSection">
            <h3>📈 Evaluation Results</h3>
            <div id="resultsContainer"></div>
        </div>

        <div class="footer">
            <p>
                Powered by
                <a href="https://github.com/Noveum/NovaEval" target="_blank">NovaEval</a>
                and
                <a href="https://huggingface.co" target="_blank">Hugging Face</a>
            </p>
            <p>Open Source • Community Driven • Free to Use</p>
        </div>
    </div>

    <script>
        // State management
        let selectedModels = [];
        let selectedDataset = null;
        let selectedMetrics = [];

        // DOM elements
        const modelOptions = document.querySelectorAll('.model-option');
        const datasetOptions = document.querySelectorAll('.dataset-option');
        const metricOptions = document.querySelectorAll('.metric-option');
        const startBtn = document.getElementById('startEvaluation');
        const progressSection = document.getElementById('progressSection');
        const resultsSection = document.getElementById('resultsSection');
        const progressFill = document.getElementById('progressFill');
        const progressText = document.getElementById('progressText');
        const progressPercent = document.getElementById('progressPercent');
        const resultsContainer = document.getElementById('resultsContainer');

        // Event listeners: toggle model selection, capped at two models
        modelOptions.forEach(option => {
            option.addEventListener('click', () => {
                const model = option.dataset.model;
                if (selectedModels.includes(model)) {
                    selectedModels = selectedModels.filter(m => m !== model);
                    option.classList.remove('selected');
                } else if (selectedModels.length < 2) {
                    selectedModels.push(model);
                    option.classList.add('selected');
                }
                updateStartButton();
            });
        });

        // Single-choice dataset selection
        datasetOptions.forEach(option => {
            option.addEventListener('click', () => {
                datasetOptions.forEach(opt => opt.classList.remove('selected'));
                option.classList.add('selected');
                selectedDataset = option.dataset.dataset;
                updateStartButton();
            });
        });

        // Multi-choice metric selection
        metricOptions.forEach(option => {
            option.addEventListener('click', () => {
                const metric = option.dataset.metric;
                if (selectedMetrics.includes(metric)) {
                    selectedMetrics = selectedMetrics.filter(m => m !== metric);
                    option.classList.remove('selected');
                } else {
                    selectedMetrics.push(metric);
                    option.classList.add('selected');
                }
                updateStartButton();
            });
        });

        startBtn.addEventListener('click', startEvaluation);

        function updateStartButton() {
            const canStart = selectedModels.length > 0 && selectedDataset && selectedMetrics.length > 0;
            startBtn.disabled = !canStart;

            if (canStart) {
                startBtn.textContent = `Evaluate ${selectedModels.length} model(s) on ${selectedDataset}`;
            } else {
                startBtn.textContent = 'Select models, dataset, and metrics';
            }
        }

        function startEvaluation() {
            // Show the progress panel and hide any previous results
            progressSection.style.display = 'block';
            resultsSection.style.display = 'none';

            // Simulate evaluation progress
            let progress = 0;
            const steps = [
                'Loading models...',
                'Preparing dataset...',
                'Running evaluations...',
                'Computing metrics...',
                'Generating results...'
            ];

            const interval = setInterval(() => {
                progress += Math.random() * 20;
                if (progress > 100) progress = 100;

                const stepIndex = Math.floor((progress / 100) * steps.length);
                const currentStep = steps[Math.min(stepIndex, steps.length - 1)];

                progressFill.style.width = progress + '%';
                progressPercent.textContent = Math.round(progress) + '%';
                progressText.textContent = currentStep;

                if (progress >= 100) {
                    clearInterval(interval);
                    showResults();
                }
            }, 500);
        }

        function showResults() {
            progressSection.style.display = 'none';
            resultsSection.style.display = 'block';

            // Generate mock results
            const results = selectedModels.map(model => {
                const modelName = model.split('/')[1] || model;
                const scores = {};

                selectedMetrics.forEach(metric => {
                    scores[metric] = (Math.random() * 0.3 + 0.7).toFixed(3); // 70-100%
                });

                return { model: modelName, scores };
            });

            // Display results
            resultsContainer.innerHTML = results.map(result => `
                <div class="result-card">
                    <h4>${result.model}</h4>
                    ${Object.entries(result.scores).map(([metric, score]) => `
                        <div style="display: flex; justify-content: space-between; margin: 10px 0;">
                            <span>${metric.toUpperCase()}:</span>
                            <span class="result-score">${(score * 100).toFixed(1)}%</span>
                        </div>
                    `).join('')}
                </div>
            `).join('');
        }

        // Initialize
        updateStartButton();
    </script>
</body>
</html>
"""

@app.get("/", response_class=HTMLResponse)
async def serve_index():
    """Serve the main application"""
    return HTMLResponse(content=HTML_CONTENT)

@app.get("/api/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "service": "novaeval-space", "version": "1.0.0"}

if __name__ == "__main__":
    port = int(os.getenv("PORT", 7860))
    logger.info(f"Starting NovaEval Space on port {port}")
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
```
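To run the app locally without Docker, a minimal sketch (the dependencies are the two pins in requirements.txt below):

```bash
pip install -r requirements.txt
python app.py

# In another shell, the health endpoint defined above should answer:
curl http://localhost:7860/api/health
# expected: {"status":"healthy","service":"novaeval-space","version":"1.0.0"}
```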
requirements.txt
ADDED
@@ -0,0 +1,3 @@

```
fastapi>=0.104.0
uvicorn[standard]>=0.24.0
```