Krisseck committed
Commit 8163dc5 • 0 Parent(s)

Initial commit

Files changed (4)
  1. .gitignore +2 -0
  2. IFEval.csv +5 -0
  3. README.md +13 -0
  4. app.py +139 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ .idea
IFEval.csv ADDED
@@ -0,0 +1,5 @@
+ Model,Parameters (B),Repo,Quantization,Final Score,Strict Prompt Score,Strict Inst Score,Loose Prompt Score,Loose Inst Score,Link
+ Llama 3 8B,8,failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF,Q8_0,0.7589,0.7001,0.7818,0.7394,0.8141,https://huggingface.co/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF
+ Llama 3 8B,8,MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF,Q8_0,0.7366,0.6765,0.7614,0.7172,0.7914,https://huggingface.co/MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF
+ Mistral 7B v0.3,7,MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF,Q8_0,0.5689,0.4972,0.5983,0.5397,0.6403,https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF
+ Phi 3 Medium 4K,14,bartowski/Phi-3-medium-4k-instruct-GGUF,Q8_0,0.5689,0.4972,0.5983,0.5397,0.6403,https://huggingface.co/bartowski/Phi-3-medium-4k-instruct-GGUF
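
For a quick sanity check of this data file, the CSV loads directly with pandas (a minimal sketch; it assumes `IFEval.csv` is in the working directory, and pandas is already a dependency of the app):

```python
import pandas as pd

# Load the leaderboard data and confirm the score columns parse as floats
df = pd.read_csv("IFEval.csv")
print(df.dtypes)

# Preview the ranking the app will display, best final score first
print(df.sort_values("Final Score", ascending=False)[["Model", "Quantization", "Final Score"]])
```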
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: IFEval Leaderboard
+ emoji: 🏆
+ colorFrom: green
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.15.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,139 @@
+ import argparse
+
+ import gradio as gr
+ import pandas as pd
+
+
+ def model_hyperlink(model_name, link):
+     # Render the repo name as a dotted-underline link that opens in a new tab
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def load_leaderboard_table_csv(filename, add_hyperlink=True):
+     # Parse the leaderboard CSV into one dict per model row
+     with open(filename) as f:
+         lines = f.readlines()
+     heads = [v.strip() for v in lines[0].split(",")]
+     rows = []
+     for line in lines[1:]:
+         row = [v.strip() for v in line.split(",")]
+         item = {}
+         for h, v in zip(heads, row):
+             if "Score" in h:
+                 item[h] = float(v)
+             elif h == "Parameters (B)":
+                 # Parameter counts are whole numbers in this table
+                 item[h] = int(v)
+             else:
+                 item[h] = v
+         if add_hyperlink:
+             item["Repo"] = model_hyperlink(item["Repo"], item["Link"])
+         rows.append(item)
+     return rows
+
+
+ def get_arena_table(model_table_df):
+     # Sort by final score, best first, and prepend a 1-based rank column
+     model_table_df = model_table_df.sort_values(by=["Final Score"], ascending=False)
+     values = []
+     for i in range(len(model_table_df)):
+         record = model_table_df.iloc[i]
+         values.append([
+             i + 1,
+             record["Model"],
+             record["Parameters (B)"],
+             record["Repo"],
+             record["Quantization"],
+             record["Final Score"],
+             record["Strict Prompt Score"],
+             record["Strict Inst Score"],
+             record["Loose Prompt Score"],
+             record["Loose Inst Score"],
+         ])
+     return values
+
+
+ def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
+     # show_plot is accepted for API compatibility but currently unused
+     if not leaderboard_table_file:
+         return
+     data = load_leaderboard_table_csv(leaderboard_table_file)
+     model_table_df = pd.DataFrame(data)
+     gr.Markdown("# 🏆 IFEval Leaderboard", elem_id="leaderboard_markdown")
+     with gr.Tabs():
+         arena_table_vals = get_arena_table(model_table_df)
+         with gr.Tab("IFEval", id=0):
+             md = (
+                 "Leaderboard for various Large Language Models, measured with the IFEval benchmark.\n\n"
+                 "[IFEval](https://github.com/google-research/google-research/tree/master/instruction_following_eval) "
+                 "is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of "
+                 '"verifiable instructions" such as "write in more than 400 words" and "mention the keyword AI '
+                 'at least 3 times". It identifies 25 types of verifiable instructions and provides around 500 '
+                 "prompts, each containing one or more of them.\n\n"
+                 "Tests were run with `lm-evaluation-harness`. Raw results can be found in the `results` "
+                 "directory. Made by [Kristian Polso](https://polso.info)."
+             )
+             gr.Markdown(md, elem_id="leaderboard_markdown")
+             gr.Dataframe(
+                 headers=[
+                     "Rank",
+                     "Model",
+                     "Parameters (B)",
+                     "Repo",
+                     "Quantization",
+                     "Final Score",
+                     "Strict Prompt Score",
+                     "Strict Inst Score",
+                     "Loose Prompt Score",
+                     "Loose Inst Score",
+                 ],
+                 datatype=[
+                     "number",
+                     "str",
+                     "number",
+                     "markdown",
+                     "str",
+                     "number",
+                     "number",
+                     "number",
+                     "number",
+                     "number",
+                 ],
+                 value=arena_table_vals,
+                 elem_id="arena_leaderboard_dataframe",
+                 height=700,
+                 column_widths=[50, 150, 100, 150, 100, 100, 100, 100, 100, 100],
+                 wrap=True,
+             )
+
+
+ def build_demo(leaderboard_table_file):
+     text_size = gr.themes.sizes.text_lg
+     with gr.Blocks(
+         title="IFEval Leaderboard",
+         theme=gr.themes.Base(text_size=text_size),
+     ) as demo:
+         build_leaderboard_tab(leaderboard_table_file, show_plot=True)
+     return demo
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", action="store_true")
+     parser.add_argument("--IFEval_file", type=str, default="./IFEval.csv")
+     args = parser.parse_args()
+
+     demo = build_demo(args.IFEval_file)
+     # Honor the --share flag when launching
+     demo.launch(share=args.share)
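
As a usage sketch, the table-building pipeline can be exercised without launching the UI (this assumes `app.py` and `IFEval.csv` sit in the current directory; importing `app` is safe because the launch code is guarded by `__main__`):

```python
import pandas as pd

from app import get_arena_table, load_leaderboard_table_csv

# Build the same rows the Gradio Dataframe displays, minus the HTML links
rows = load_leaderboard_table_csv("./IFEval.csv", add_hyperlink=False)
df = pd.DataFrame(rows)
for rank, model, *rest in get_arena_table(df):
    print(rank, model, rest[3])  # rest[3] is the Final Score column
```

To serve the leaderboard itself, `python app.py` starts a local Gradio server, and `python app.py --share` additionally requests a public share link.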