File size: 2,208 Bytes
ea047ad
 
fdfafe5
9562cba
ea047ad
4e21b7f
ea047ad
fdfafe5
 
 
ea047ad
fdfafe5
67741f2
ea047ad
3adea5e
 
9562cba
 
ea047ad
fdfafe5
64a657c
67741f2
2d54755
67741f2
0569297
448c286
0569297
 
 
67741f2
 
ea047ad
 
67741f2
64a657c
3adea5e
67741f2
 
 
ea047ad
 
 
 
3adea5e
ea047ad
3adea5e
ea047ad
3adea5e
ea047ad
 
 
 
 
 
67741f2
2d54755
67741f2
 
4e21b7f
 
 
 
 
67741f2
 
ea047ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import asyncio
import subprocess
from pathlib import Path

from loguru import logger

from yourbench_space.leaderboard_space.env import INIT_MODELS


# True when running inside a Hugging Face Space (the deploy env sets "system=spaces").
ON_SPACES = os.environ.get("system") == "spaces"
# Spaces get persistent storage mounted at /data; locally write to the cwd.
OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder


def create_eval_file(eval_ds_name: str) -> None:
    """Generate a lighteval custom-task file for *eval_ds_name*.

    Slashes in the dataset id are replaced with underscores to form the
    task name, then ``lighteval tasks create`` is invoked on the bundled
    task template.

    Args:
        eval_ds_name: Hub dataset id, e.g. ``org/dataset``.
    """
    task_name = eval_ds_name.replace("/", "_")
    # NOTE(review): hard-coded Space path — presumably only valid on the deployed
    # Space; confirm for local runs.
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    result = subprocess.run(
        ["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name]
    )
    # The original ignored the exit status entirely; surface failures in the
    # logs without introducing a new exception type for callers to handle.
    if result.returncode != 0:
        logger.error(f"lighteval task creation failed with exit code {result.returncode}")


async def run_process(args: list, custom_env=None, timeout: float = 350) -> dict:
    """Run *args* as a subprocess and capture its decoded output.

    Uses ``communicate()`` instead of ``wait()`` plus separate pipe reads:
    waiting first can deadlock if the child fills a PIPE buffer (per the
    asyncio subprocess docs). On timeout the child is killed so no orphan
    process is leaked.

    Args:
        args: Argument vector for the subprocess.
        custom_env: Optional environment mapping passed to the child.
        timeout: Seconds to wait before killing the child (default 350,
            the previously hard-coded value).

    Returns:
        Dict with the child's ``pid`` and decoded ``stdout``/``stderr``.
    """
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=custom_env
    )
    try:
        # communicate() drains both pipes while waiting for exit.
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
    except asyncio.TimeoutError:  # on <3.11 wait_for raises asyncio.TimeoutError, not the builtin
        logger.error("Lighteval process Timed Out")
        process.kill()  # don't leak the child process
        stdout, stderr = await process.communicate()

    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str, custom_env=None) -> str:
    """Run one lighteval evaluation per configured model, concurrently.

    Builds a ``lighteval endpoint inference-providers`` command for every
    ``(model_name, provider)`` pair in ``INIT_MODELS`` and runs them all via
    :func:`run_process`, gathering results with ``return_exceptions=True`` so
    one failed run does not cancel the others.

    Args:
        eval_ds_name: Hub dataset id the custom task was created from.
        org: Hub organization the results are pushed to.
        custom_env: Optional environment mapping for the child processes.

    Returns:
        ``"✅"`` if every run succeeded, otherwise
        ``"At least one model failed"``.
    """
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args, custom_env))
    # return_exceptions=True: failures come back as Exception objects.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            # Bug fix: the original subscripted Exception results here, which
            # raised TypeError precisely when a run had failed.
            logger.error(f"Evaluation task raised an exception: {result!r}")
            continue
        logger.info("Logs for process:")
        logger.info(result["stdout"])
        logger.info(result["stderr"])

    if all(not isinstance(result, Exception) for result in results):
        return "✅"
    return "At least one model failed"