Fix git conflict
Files changed (4):
- app.py (+11 -3)
- src/display/about.py (+46 -5)
- src/display/utils.py (+52 -22)
- src/populate.py (+3 -2)
app.py CHANGED

@@ -306,9 +306,17 @@ with demo:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(label="Model name")
-            precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
-
-
+            #precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
+            precision = gr.Dropdown(
+                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                label="Precision",
+                multiselect=False,
+                value="other",
+                interactive=True,
+                info="What weight precision were you using during the evaluation?"
+            )
+            hf_model_id = gr.Textbox(label="Model link (Optional)", info="URL to the model's Hugging Face repository, or its official website")
+            contact_email = gr.Textbox(label="Your E-Mail")
             file_output = gr.File()
             upload_button = gr.UploadButton("Upload json", file_types=['.json'])
             upload_button.upload(validate_upload, upload_button, file_output)
src/display/about.py CHANGED

@@ -31,6 +31,8 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
+TABLE_DESC = "The values presented in the table represent the accuracy metric."
+
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Czech-Bench is a collection of LLM benchmarks available for the Czech language. It currently consists of 15 Czech benchmarks, including new machine translations of the popular ARC, GSM8K, MMLU, and TruthfulQA datasets.

@@ -38,20 +40,55 @@ Czech-Bench is a collection of LLM benchmarks available for the Czech language.
 Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
 """
 
-TABLE_DESC = "The values presented in the table represent the accuracy metric."
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Basic Information
-The goal of
+The goal of the CzechBench project is to provide a comprehensive and practical benchmark for evaluating Czech language models.
+Our [evaluation suite](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+currently consists of 15 individual tasks, leveraging pre-existing Czech datasets together with new machine translations of popular LLM benchmarks,
+including ARC, GSM8K, MMLU, and TruthfulQA.
 
 Key Features and Benefits:
 - **Tailored for the Czech Language:** The benchmark includes both original Czech datasets and adapted versions of international datasets, ensuring relevant evaluation of model performance in the Czech context.
 - **Wide Range of Tasks:** It contains 15 different tasks that cover various aspects of language understanding and text generation, enabling a comprehensive assessment of the model's capabilities.
+- **Universal model support:** The universal text-to-text evaluation approach adopted in CzechBench allows for direct comparison of models with varying levels of internal access, including commercial APIs.
 - **Ease of Use:** The benchmark is designed to be easily integrated into your development process, saving time and resources during model testing and improvement.
 - **Up-to-date and Relevant:** We regularly update our datasets to reflect the latest findings and trends in language model development.
+By using CzechBench, you will gain deep insights into the strengths and weaknesses of your models, allowing you to better focus on key areas for optimization.
+This will not only improve the performance of your models but also enhance their real-world deployment in various Czech contexts.
+
+Below, you can find the up-to-date leaderboard of models evaluated on CzechBench.
+For more information on the included benchmarks and instructions on evaluating your own models, please visit the "About" section below.
+
+"""
+# Czech-Bench is developed by <a href="https://huggingface.co/CIIRC-NLP">CIIRC-NLP</a>.
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## Basic Information
 
+The CzechBench evaluation suite is hosted on [GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme).
+It is implemented on top of the popular [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) framework, which provides extensive model compatibility and optimal evaluation efficiency.
+
+All currently supported benchmarks are listed in the table below:
+
+| Dataset | Language | Task type | Metrics | Samples | Task ID |
+| ------------------------------------------------------------ | ----------------------------- | -------------------------- | -------------- | ------: | --------------- |
+| [AGREE](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/agree_cs) | CS (Original) | Subject-verb agreement | Acc | 627 | agree_cs |
+| [ANLI](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/anli_cs) | CS (Translated) | Natural Language Inference | Acc, Macro F1 | 1200 | anli_cs |
+| [ARC Challenge](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 1172 | arc_cs |
+| [ARC Easy](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/arc_cs) | CS (Translated) | Knowledge-Based QA | Acc | 2376 | arc_cs |
+| [Belebele](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/belebele_cs) | CS (Professional translation) | Reading Comprehension / QA | Acc | 895 | belebele_cs |
+| [CTKFacts](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/ctkfacts_cs) | CS (Original) | Natural Language Inference | Acc, Macro F1 | 558 | ctkfacts_cs |
+| [Czech News](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/czechnews_cs) | CS (Original) | News Topic Classification | Acc, Macro F1 | 1000 | czechnews_cs |
+| [Facebook Comments](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/fb_comments_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 1000 | fb_comments_cs |
+| [GSM8K](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/gsm8k_cs) | CS (Translated) | Mathematical inference | EM Acc | 1319 | gsm8k_cs |
+| [Klokánek](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/klokanek_cs) | CS (Original) | Math/Logical Inference | Acc | 808 | klokanek_cs |
+| [Mall Reviews](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mall_reviews_cs) | CS (Original) | Sentiment Analysis | Acc, Macro F1 | 3000 | mall_reviews_cs |
+| [MMLU](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/mmlu_cs) | CS (Translated) | Knowledge-Based QA | Acc | 12408 | mmlu_cs |
+| [SQAD](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/sqad_cs) | CS (Original) | Reading Comprehension / QA | EM Acc, BoW F1 | 843 | sqad_cs |
+| [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
+| [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
 ## Evaluation Process
 

@@ -79,10 +116,14 @@ lm_eval --model hf \\
 --output_path $OUTPUT_PATH \\
 --apply_chat_template \\
 ```
+
+For advanced usage instructions, please inspect the [CzechBench README on GitHub](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench#readme)
+or the official [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) documentation.
 
 
 ### 3. Upload results to Leaderboard
-
+Inside the `$OUTPUT_PATH` directory, you can find the file `results.json`.
+To submit your evaluation results to our leaderboard, please visit the "Submit here!" section above and upload your `results.json` file.
 
 """
 
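The edited LLM_BENCHMARKS_TEXT walks through running the CzechBench tasks with lm-evaluation-harness and then uploading the resulting results.json. Below is a hedged sketch of that workflow driven from Python: the model name and task subset are placeholders, and the exact location of the results file inside $OUTPUT_PATH can vary between lm-eval versions, so the glob is deliberately loose.

```python
# Illustrative only: run a subset of the CzechBench task IDs from the table above
# through the lm_eval CLI, then locate the results JSON to upload in step 3.
import json
import subprocess
from pathlib import Path

OUTPUT_PATH = Path("czechbench_results")
TASKS = "agree_cs,anli_cs,gsm8k_cs"                       # placeholder subset of Task IDs
MODEL_ARGS = "pretrained=mistralai/Mistral-7B-Instruct-v0.2"  # placeholder model

subprocess.run(
    [
        "lm_eval",
        "--model", "hf",
        "--model_args", MODEL_ARGS,
        "--tasks", TASKS,
        "--output_path", str(OUTPUT_PATH),
        "--apply_chat_template",
    ],
    check=True,
)

# Step 3: this is the file to upload via the leaderboard's "Submit here!" tab.
results_file = sorted(OUTPUT_PATH.rglob("results*.json"))[-1]
print(json.dumps(json.loads(results_file.read_text())["results"], indent=2))
```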
src/display/utils.py CHANGED

@@ -47,30 +47,53 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 """
 
-
+
+auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
-auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model link (temporary)", "str", True)])
+auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
+auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
+auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
+auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("ARC-Easy", "number", True)])
+auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("Belebele", "number", True)])
+auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("CTKFacts", "number", True)])
+auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("Czech News", "number", True)])
+auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("Facebook Comments", "number", True)])
+auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("GSM8K", "number", True)])
+auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("Klokanek", "number", True)])
+auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("Mall Reviews", "number", True)])
+auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)])
+auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
+auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
+auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+HEADER_MAP = {
+    "eval_name": "Model",
+    "precision": "Precision",
+    "hf_model_id": "Model link (temporary)",
+    "agree_cs": "AGREE",
+    "anli_cs": "ANLI",
+    "arc_challenge_cs": "ARC-Challenge",
+    "arc_easy_cs": "ARC-Easy",
+    "belebele_cs": "Belebele",
+    "ctkfacts_cs": "CTKFacts",
+    "czechnews_cs": "Czech News",
+    "fb_comments_cs": "Facebook Comments",
+    "gsm8k_cs": "GSM8K",
+    "klokanek_cs": "Klokanek",
+    "mall_reviews_cs": "Mall Reviews",
+    "mmlu_cs": "MMLU",
+    "sqad_cs": "SQAD",
+    "subjectivity_cs": "Subjectivity",
+    "truthfulqa_cs": "TruthfulQA",
+}
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column

@@ -120,6 +143,9 @@ class WeightType(Enum):
 
 
 class Precision(Enum):
+    other = ModelDetails("other")
+    float64 = ModelDetails("float64")
+    float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")

@@ -128,17 +154,21 @@ class Precision(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(precision):
-        if precision in ["torch.
+        if precision in ["torch.float64", "torch.double", "float64"]:
+            return Precision.float64
+        if precision in ["torch.float32", "torch.float", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "torch.half", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision in ["8bit"]:
+        if precision in ["8bit", "int8"]:
             return Precision.qt_8bit
-        if precision in ["4bit"]:
+        if precision in ["4bit", "int4"]:
             return Precision.qt_4bit
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
-        return Precision.
+        return Precision.other
 
 
 # Column selection

@@ -150,7 +180,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [HEADER_MAP[t.value.col_name] for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/populate.py CHANGED

@@ -4,7 +4,8 @@ import numpy as np
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
-from src.display.
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 

@@ -13,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(raw_data)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df.rename(columns=
+    df = df.rename(columns=HEADER_MAP)
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced