import csv
import json
import os
import sys

import pandas as pd

# Model name -> (Hugging Face URL, parameter count in billions, category, release tag).
model_details = {
    "DeepSeek R1-0528": ("https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", 685, "General", "V2"),
    "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General", "V1"),
    "Llama 3.1 405B": ("https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8", 406, "General", "V1"),
    "Qwen3 236B A22B": ("https://huggingface.co/Qwen/Qwen3-235B-A22B", 235, "General", "V2"),
    "Llama 3.(1-3) 70B": ("https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General", "V1"),
    "Qwen2.5 72B": ("https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General", "V1"),
    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1"),
    "StarChat2 15B v0.1": ("https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General", "V1"),
    "DeepSeek R1 Distill Qwen 14B": ("https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General", "V1"),
    "CodeLlama 70B": ("https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding", "V1"),
    "QwenCoder 2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding", "V1"),
    "DeepSeek Coder 33B": ("https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding", "V1"),
    "QwenCoder 2.5 14B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding", "V1"),
    "DeepCoder 14B": ("https://huggingface.co/agentica-org/DeepCoder-14B-Preview", 14.8, "Coding", "V2"),
    "OpenCoder 8B": ("https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding", "V1"),
    "SeedCoder 8B": ("https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct", 8.25, "Coding", "V2"),
    "SeedCoder 8B Reasoning": ("https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16", 8.25, "Coding", "V2"),
    "QwenCoder 2.5 7B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding", "V1"),
    "DeepSeek Coder 6.7B": ("https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding", "V1"),
    "HaVen-CodeQwen": ("https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific", "V1"),
    "CodeV R1 Distill Qwen 7B": ("https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B", 7.62, "RTL-Specific", "V2"),
    "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific", "V1"),
    "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific", "V1"),
    "CodeV-DS-6.7B": ("https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific", "V1"),
    "RTLCoder Mistral": ("https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific", "V1"),
    "RTLCoder DeepSeek": ("https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific", "V1"),
    "OriGen": ("https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1"),
    "Qwen3 Coder 480B A35B": ("https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct", 480, "Coding", "V2"),
    "Magistral Small 2506": ("https://huggingface.co/mistralai/Magistral-Small-2506", 23.6, "General", "V2"),
}


def get_headers(reader, agg=False) -> tuple[list, list]:
    """Read the header rows of a results CSV.

    The first row lists the metric of each column, the second row the
    benchmark. Aggregated CSVs have no benchmark row, so only the metric
    list is returned in that case.
    """
    metrics, benchs = [], []
    for i, row in enumerate(reader):
        if i == 0:
            metrics = row[1:]
        elif i == 1 and not agg:
            benchs = row[1:]
            break
        else:
            return metrics
    return metrics, benchs


def get_model_params_and_url(model) -> tuple[str, float, str, str]:
    """Look up (URL, params, category, release) for a model, with safe defaults."""
    if model not in model_details:
        return "-", 0.0, "Unknown", "Unknown"
    url, params, model_type, release = model_details[model]
    return url, params, model_type, release
"-", 0.0, "Unknown", "Unknown" url = model_details[model][0] params = model_details[model][1] type = model_details[model][2] release = model_details[model][3] return url, params, type, release def parse_results(csv_path: str) -> list[dict]: """ Each row has the following format: MODEL | BENCHMARK | TASK | METRIC | RESULT """ dataset = [] models = [] with open(csv_path, newline="") as csvfile: reader = csv.reader(csvfile, delimiter=",") metrics, benchs = get_headers(reader) for i, row in enumerate(reader): if not row or all(not cell.strip() for cell in row): continue model = row[0] if not model: continue url, params, type, release = get_model_params_and_url(model) models.append(model) row = row[1:] ctr = 0 for metric, bench in zip(metrics, benchs): if metric == "EM": metric = "Exact Matching (EM)" record = {} record["Model"] = model record["Model Type"] = type record["Benchmark"] = bench record["Task"] = metric record["Result"] = float(row[ctr].replace(",", ".")) record["Model URL"] = url record["Params"] = params record["Release"] = release dataset.append(record) ctr += 1 print(models) return dataset def parse_agg(csv_path: str = "results/aggregated_scores_icarus.csv") -> pd.DataFrame: """ Each row has the following format: MODEL | BENCHMARK | TASK | METRIC | RESULT """ return pd.read_csv(csv_path) def writeJson(data: list, path: str): with open(path, "w") as f: json.dump(data, f, indent=4, ensure_ascii=False) print("Done") def read_json(json_path: str = "results/results_icarus.json"): with open(json_path, "r", encoding="utf-8") as file: data = json.load(file) return data def read_data( json_path: str = "results/results_icarus.json", ) -> tuple[pd.DataFrame, list, list, str]: data = read_json(json_path) df = pd.DataFrame(data) df.rename( columns={ "Model": "Model", "Benchmark": "Benchmark", "Task": "Metric", "Result": "Score", "EM": "Exact Matching (EM)", }, inplace=True, ) df["Params"] = pd.to_numeric(df["Params"], errors="coerce") benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True) metrics = df["Metric"].unique().tolist() default_metric = ( "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0] ) return df, benchmarks, metrics, default_metric if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python results/parse.py ") sys.exit(1) csv_path = sys.argv[1] if not os.path.exists(csv_path): print(f"Error: File not found at {csv_path}") sys.exit(1) json_path = os.path.splitext(csv_path)[0] + ".json" print(f"Parsing {csv_path}...") parsed_data = parse_results(csv_path) writeJson(parsed_data, json_path)