hongfu_update_20250701 (#19)
- change encoding (9fd9d7365fbd38a77397694bc18c8d0d1ca59888)
Co-authored-by: GAO <[email protected]>
- .idea/workspace.xml +58 -0
- app.py +34 -32
- data/chinese_benchmark_per.csv +1 -1
- data/subclass_gen.csv +1 -1
- data/subclass_per.csv +1 -1
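The substantive change is in app.py: the pd.read_csv calls that load the leaderboard CSVs now pass encoding='utf-8' explicitly, so the Chinese text in the data files is decoded as UTF-8 rather than relying on whatever default the running pandas version applies. A minimal sketch of the resulting loading code, assuming the same data/ paths touched by this commit (a standalone snippet, not the full app.py):

import pandas as pd

# Pin the encoding when reading the benchmark tables; the file names are the
# ones listed in this commit's data/ directory.
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding="utf-8")
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding="utf-8")
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding="utf-8")
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding="utf-8")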
.idea/workspace.xml
ADDED
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectColorInfo">{
+  "customColor": "",
+  "associatedIndex": 2
+}</component>
+  <component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "git-widget-placeholder": "pr/18",
+    "last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
+      <created>1751365967779</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1751365967779</updated>
+      <workItem from="1751365968934" duration="39000" />
+      <workItem from="1751366116696" duration="54000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+</project>
app.py
CHANGED
@@ -6,11 +6,13 @@ import pandas as pd
 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT


-ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv",
-ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv",
+ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
+ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
+
+

-ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
-ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
+ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
+ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')

 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]

@@ -28,7 +30,7 @@ CLASSIFICATION = {
         "5B~10B",
         "API",
     ]
-
+
 }


@@ -36,13 +38,13 @@ CLASSIFICATION = {

 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
-      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
+      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2410.18491},
+      url={https://arxiv.org/abs/2410.18491},
 }
 """

@@ -62,8 +64,8 @@ def format_csv_numbers(text):

 def format_csv_numbers_second(text):
     return text.split()
-
-
+
+
 def format_number(x):
     return float(f"{x:.3}")

@@ -73,7 +75,7 @@ def get_dataset_csv(
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -101,11 +103,11 @@ def get_dataset_csv_sub_gen(
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -120,11 +122,11 @@ def get_dataset_csv_sub_per(
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -143,7 +145,7 @@ def get_dataset_classfier_gen(
     subclass_choice = main_choice
     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
-
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -164,10 +166,10 @@ with gr.Blocks() as demo:

     with gr.Row():
         gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
-
+
     with gr.Row():
         gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
-
+
     with gr.Row():
         with gr.Column(scale=0.8):
             main_choice = gr.Dropdown(
@@ -176,8 +178,8 @@ with gr.Blocks() as demo:
                 label="Type",
                 info="Please choose the type to display.",
             )
-
-        with gr.Column(scale=10):
+
+        with gr.Column(scale=10):
             model_choice = gr.CheckboxGroup(
                 choices=CLASSIFICATION["model_size"],
                 value=CLASSIFICATION["model_size"], # all be choosed
@@ -188,12 +190,12 @@ with gr.Blocks() as demo:
     #π this part is for csv table generatived
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
-
+
         with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
-
+
         with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
@@ -202,10 +204,10 @@ with gr.Blocks() as demo:
     # ----------------- modify text -----------------
     with gr.Row():
         gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
-
+
     with gr.Row():
         gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
-
+
     # π this part is for citation
     with gr.Row():
         with gr.Accordion("π Citation", open=True):
@@ -216,18 +218,18 @@ with gr.Blocks() as demo:
                 elem_id="citation-button",
                 show_copy_button=True
            )
-
+
     gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
-
+
     # --------------------------- all --------------------------------
     # this is all result Perplexity
-
+
     main_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
-
+
     model_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
@@ -239,26 +241,26 @@ with gr.Blocks() as demo:
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
-
+
     # this is all result generatived
     main_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
-
+
     model_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
-
+
     demo.load(
         fn=get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
-
-
+
+
 demo.launch(share=True)
data/chinese_benchmark_per.csv
CHANGED
@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
 Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
 Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
 Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
-Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
+Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
data/subclass_gen.csv
CHANGED
@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
 QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
 Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
 Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
-OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
+OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
data/subclass_per.csv
CHANGED
@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
 QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
 Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
 OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
-Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
+Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814