terryyz committed
Commit f175fac · verified · 1 parent: 7836cdd

Update app.py

Files changed (1): app.py (+0 -296)
app.py CHANGED
@@ -1,296 +0,0 @@
import gradio as gr
import json
import logging
import multiprocessing
import os
import pickle
import threading
import time
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
from datetime import datetime
from typing import Any, Dict, List, Tuple
from warnings import warn
import gc

import numpy as np
from huggingface_hub import HfApi
from bigcodebench.data import get_bigcodebench, get_bigcodebench_hash, load_solutions
from bigcodebench.data.utils import CACHE_DIR
from bigcodebench.eval import PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check
from bigcodebench.gen.util import trusted_check
from apscheduler.schedulers.background import BackgroundScheduler

REPO_ID = "bigcode/bigcodebench-evaluator"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
API = HfApi(token=HF_TOKEN)
Result = Tuple[str, List[bool]]


def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit):
    # Ground-truth runtimes are cached per dataset hash so they are only measured once.
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    tbegin = time.time()

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        n_samples = 0
        expected_time = dict()

        for problem in problems.values():
            args = (
                problem["complete_prompt"] + "\n" + problem["canonical_solution"],
                problem["test"],
                problem["task_id"],
                max_as_limit,
                max_data_limit,
                max_stack_limit,
                min_time_limit,
            )

            futures.append(executor.submit(trusted_check, *args))
            n_samples += 1

        for future in as_completed(futures):
            result = future.result()
            expected_time[result["task_id"]] = result["time"]

    if any(expected_time.values()):
        with open(cache_file, "wb") as f:
            pickle.dump(expected_time, f)

    return expected_time

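
# The runtimes recorded above are passed to check_correctness below as
# gt_time_limit; tasks whose ground truth failed to run fall back to a
# fixed 20-second budget (see the submission loop in evaluate).
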
def check_correctness(
    completion_id: int,
    problem: Dict[str, Any],
    solution: str,
    max_as_limit: float,
    max_data_limit: float,
    max_stack_limit: float,
    identifier=None,
    min_time_limit: float = 0.1,
    gt_time_limit: float = 2.0,
) -> Dict[str, Result]:
    ret = {
        "completion_id": completion_id,
        "task_id": problem["task_id"],
        "_identifier": identifier,
        "solution": solution,
    }
    ret["base"] = untrusted_check(
        solution,
        problem["test"],
        problem["entry_point"],
        max_as_limit,
        max_data_limit,
        max_stack_limit,
        min_time_limit,
        gt_time_limit,
    )
    return ret


def evaluate(
    split: str,
    subset: str,
    samples: str,
    pass_k: str = "1,5,10",
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    calibrated: bool = True,
    check_gt_only: bool = False,
    no_gt: bool = False,
    selective_evaluate: str = "",
):
    passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
    if parallel < 1:
        n_workers = max(1, multiprocessing.cpu_count() // 2)
    else:
        n_workers = parallel

    if check_gt_only:
        samples = "__dummy__.jsonl"

    extra = subset + "_" if subset != "full" else ""

    problems = get_bigcodebench(subset=subset)

    # Add selective evaluation logic
    if selective_evaluate:
        selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
        problems = {k: v for k, v in problems.items() if k in selected_ids}
        if not problems:
            raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")

    dataset_hash = get_bigcodebench_hash(subset=subset)

    if not no_gt:
        expected_time = get_groundtruth(n_workers, problems, dataset_hash, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit)
    else:
        expected_time = {task_id: None for task_id in problems}

    gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
    failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

    pass_at_k = dict()
    results = {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "eval": {},
    }

    if not check_gt_only:

        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = []
            completion_id = Counter()
            n_samples = 0
            eval_results = defaultdict(list)  # task_id -> list of check_correctness results
            remainings = set()

            for sample in load_solutions(samples):
                task_id = sample["task_id"]

                if task_id not in problems:
                    continue
                solution = (
                    sample["solution"]
                    if "solution" in sample
                    else problems[task_id]["complete_prompt"] + sample["completion"]
                )
                if calibrated:
                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
                remainings.add(sample["_identifier"])
                args = (
                    completion_id[task_id],
                    problems[task_id],
                    solution,
                    max_as_limit,
                    max_data_limit,
                    max_stack_limit,
                    sample["_identifier"],
                    min_time_limit,
                    expected_time[task_id] if expected_time[task_id] else 20
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
                n_samples += 1

            assert n_samples == len(remainings), "Missing problems in unfinished"
            assert len(completion_id) == len(problems), "Missing problems in samples"

            for future in as_completed(futures):
                result = future.result()
                remainings.remove(result["_identifier"])
                eval_results[result["task_id"]].append(result)
                del future, result
                gc.collect()

        # sort the results for each problem by completion_id
        for task_id, task_results in eval_results.items():
            task_results.sort(key=lambda x: x["completion_id"])
            results["eval"][task_id] = []
            for res in task_results:
                stat, details = res["base"]
                results["eval"][task_id].append(
                    {
                        "task_id": task_id,
                        "solution": res["solution"],
                        "status": stat,
                        "details": details,
                    }
                )

        # Calculate pass@k.
        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
        base_correct = []

        for key, res in results["eval"].items():
            if key not in problems:
                continue
            bc = sum([r["status"] == PASS for r in res])
            base_correct.append(bc)

        base_correct = np.array(base_correct)

        pass_at_k.update({
            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
            for k in passk
            if total.min() >= k
        })

        del problems, futures
        gc.collect()

    pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
    pass_at_k["split"] = split
    pass_at_k["subset"] = subset
    pass_at_k["calibrated"] = calibrated
    pass_at_k["gt_pass_rate"] = gt_pass_rate
    pass_at_k["failed_tasks"] = failed_tasks

    return results, pass_at_k

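
# Calibration note: with calibrated=True, each sample above is evaluated as
#     <code_prompt>
#         pass
#     <sample solution>
# i.e. the task's code prompt (with a stub body) is prepended, so a solution
# can rely on the imports and function signature from the prompt even if it
# does not repeat them.
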
# def run_gradio():
interface = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
        gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
        gr.File(label="Samples Path (.jsonl)"),
        gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
        gr.Slider(-1, multiprocessing.cpu_count(), step=1, label="Parallel Workers", value=-1),
        gr.Slider(0.1, 10, step=0.1, label="Min Time Limit", value=1),
        gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
        gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
        gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
        gr.Checkbox(label="Calibrated", value=True),
        gr.Checkbox(label="Check GT Only"),
        gr.Checkbox(label="No GT"),
        gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
    ],
    outputs=[
        gr.JSON(label="Results"),
        gr.JSON(label="Eval Results"),
    ],
    # concurrency_limit=None
)
interface.queue(default_concurrency_limit=None)


def preload_gt():
    # Warm the ground-truth runtime cache for both subsets before serving requests.
    evaluate(split="complete", subset="full", samples="", check_gt_only=True)
    evaluate(split="complete", subset="hard", samples="", check_gt_only=True)


def restart_space():
    logging.info(f"Restarting space with repo ID: {REPO_ID}")
    try:
        # Now restart the space
        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
        logging.info("Space restarted successfully.")
    except Exception as e:
        logging.error(f"Failed to restart space: {e}")


# if __name__ == "__main__":
while True:
    try:
        preload_gt()
        break
    except Exception:
        # Retry until the ground-truth cache is successfully built.
        continue

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=1)  # restart every hour
scheduler.start()
interface.launch(show_error=True)
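
For reference, a minimal sketch of the standard unbiased pass@k estimator (Chen et al., 2021) that `estimate_pass_at_k` from `bigcodebench.eval` is assumed to implement; `pass_at_k` below is a hypothetical standalone helper, not the library function:

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # Probability that at least one of k samples drawn without replacement
    # from n total samples (c of them correct) passes: 1 - C(n-c, k) / C(n, k).
    if n - c < k:
        return 1.0  # every size-k subset must contain a correct sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

Averaged over tasks, np.mean([pass_at_k(n, c, k) for n, c in zip(total, base_correct)]) would match the pass@k values reported by evaluate, under that assumption.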