hongfu_update_20250701

#19
by Jerry0723 - opened
.idea/workspace.xml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ChangeListManager">
4
+ <list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
5
+ <option name="SHOW_DIALOG" value="false" />
6
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
7
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
8
+ <option name="LAST_RESOLUTION" value="IGNORE" />
9
+ </component>
10
+ <component name="Git.Settings">
11
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
12
+ </component>
13
+ <component name="MarkdownSettingsMigration">
14
+ <option name="stateVersion" value="1" />
15
+ </component>
16
+ <component name="ProjectColorInfo">{
17
+ &quot;customColor&quot;: &quot;&quot;,
18
+ &quot;associatedIndex&quot;: 2
19
+ }</component>
20
+ <component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
21
+ <component name="ProjectViewState">
22
+ <option name="hideEmptyMiddlePackages" value="true" />
23
+ <option name="showLibraryContents" value="true" />
24
+ </component>
25
+ <component name="PropertiesComponent"><![CDATA[{
26
+ "keyToString": {
27
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
28
+ "RunOnceActivity.ShowReadmeOnStart": "true",
29
+ "git-widget-placeholder": "pr/18",
30
+ "last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
31
+ "nodejs_package_manager_path": "npm",
32
+ "vue.rearranger.settings.migration": "true"
33
+ }
34
+ }]]></component>
35
+ <component name="SharedIndexes">
36
+ <attachedChunks>
37
+ <set>
38
+ <option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
39
+ </set>
40
+ </attachedChunks>
41
+ </component>
42
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
43
+ <component name="TaskManager">
44
+ <task active="true" id="Default" summary="Default task">
45
+ <changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
46
+ <created>1751365967779</created>
47
+ <option name="number" value="Default" />
48
+ <option name="presentableId" value="Default" />
49
+ <updated>1751365967779</updated>
50
+ <workItem from="1751365968934" duration="39000" />
51
+ <workItem from="1751366116696" duration="54000" />
52
+ </task>
53
+ <servers />
54
+ </component>
55
+ <component name="TypeScriptGeneratedFilesManager">
56
+ <option name="version" value="3" />
57
+ </component>
58
+ </project>
app.py CHANGED
@@ -6,11 +6,13 @@ import pandas as pd
6
  from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
7
 
8
 
9
- ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
10
- ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
 
 
 
 
11
 
12
- ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
13
- ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
14
 
15
  METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
16
 
@@ -28,7 +30,7 @@ CLASSIFICATION = {
28
  "5B~10B",
29
  "API",
30
  ]
31
-
32
  }
33
 
34
 
@@ -36,13 +38,13 @@ CLASSIFICATION = {
36
 
37
  _BIBTEX = """
38
  @misc{zhang2024chinesesafechinesebenchmarkevaluating,
39
- title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
40
  author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
41
  year={2024},
42
  eprint={2410.18491},
43
  archivePrefix={arXiv},
44
  primaryClass={cs.CL},
45
- url={https://arxiv.org/abs/2410.18491},
46
  }
47
  """
48
 
@@ -62,8 +64,8 @@ def format_csv_numbers(text):
62
 
63
  def format_csv_numbers_second(text):
64
  return text.split()
65
-
66
-
67
  def format_number(x):
68
  return float(f"{x:.3}")
69
 
@@ -73,7 +75,7 @@ def get_dataset_csv(
73
  ):
74
  df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
75
  df = df.drop(columns="Size")
76
-
77
  leaderboard_table = gr.components.Dataframe(
78
  value=df,
79
  interactive=False,
@@ -101,11 +103,11 @@ def get_dataset_csv_sub_gen(
101
  ):
102
  df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
103
  df = df.drop(columns="Size")
104
-
105
  # get subclass
106
  subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
107
  df = df[subclass_choice_label]
108
-
109
  leaderboard_table = gr.components.Dataframe(
110
  value=df,
111
  interactive=False,
@@ -120,11 +122,11 @@ def get_dataset_csv_sub_per(
120
  ):
121
  df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
122
  df = df.drop(columns="Size")
123
-
124
  # get subclass
125
  subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
126
  df = df[subclass_choice_label]
127
-
128
  leaderboard_table = gr.components.Dataframe(
129
  value=df,
130
  interactive=False,
@@ -143,7 +145,7 @@ def get_dataset_classfier_gen(
143
  subclass_choice = main_choice
144
  leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
145
  return leaderboard_table
146
-
147
  def get_dataset_classfier_per(
148
  model_size: List[str],
149
  main_choice: List[str],
@@ -164,10 +166,10 @@ with gr.Blocks() as demo:
164
 
165
  with gr.Row():
166
  gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
167
-
168
  with gr.Row():
169
  gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
170
-
171
  with gr.Row():
172
  with gr.Column(scale=0.8):
173
  main_choice = gr.Dropdown(
@@ -176,8 +178,8 @@ with gr.Blocks() as demo:
176
  label="Type",
177
  info="Please choose the type to display.",
178
  )
179
-
180
- with gr.Column(scale=10):
181
  model_choice = gr.CheckboxGroup(
182
  choices=CLASSIFICATION["model_size"],
183
  value=CLASSIFICATION["model_size"], # all sizes selected by default
@@ -188,12 +190,12 @@ with gr.Blocks() as demo:
188
  # 👉 this part builds the tabs holding the CSV-backed result tables
189
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
190
  # ----------------- modify text -----------------
191
-
192
  with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
193
  dataframe_all_gen = gr.components.Dataframe(
194
  elem_id="leaderboard-table",
195
  )
196
-
197
  with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
198
  dataframe_all_per = gr.components.Dataframe(
199
  elem_id="leaderboard-table",
@@ -202,10 +204,10 @@ with gr.Blocks() as demo:
202
  # ----------------- modify text -----------------
203
  with gr.Row():
204
  gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
205
-
206
  with gr.Row():
207
  gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
208
-
209
  # 👉 this part is for citation
210
  with gr.Row():
211
  with gr.Accordion("📙 Citation", open=True):
@@ -216,18 +218,18 @@ with gr.Blocks() as demo:
216
  elem_id="citation-button",
217
  show_copy_button=True
218
  )
219
-
220
  gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
221
-
222
  # --------------------------- all --------------------------------
223
  # event handlers that refresh the Perplexity results table
224
-
225
  main_choice.change(
226
  get_dataset_classfier_per,
227
  inputs=[model_choice, main_choice],
228
  outputs=dataframe_all_per,
229
  )
230
-
231
  model_choice.change(
232
  get_dataset_classfier_per,
233
  inputs=[model_choice, main_choice],
@@ -239,26 +241,26 @@ with gr.Blocks() as demo:
239
  inputs=[model_choice, main_choice],
240
  outputs=dataframe_all_per,
241
  )
242
-
243
  # event handlers that refresh the Generation results table
244
  main_choice.change(
245
  get_dataset_classfier_gen,
246
  inputs=[model_choice, main_choice],
247
  outputs=dataframe_all_gen,
248
  )
249
-
250
  model_choice.change(
251
  get_dataset_classfier_gen,
252
  inputs=[model_choice, main_choice],
253
  outputs=dataframe_all_gen,
254
  )
255
-
256
  demo.load(
257
  fn=get_dataset_classfier_gen,
258
  inputs=[model_choice, main_choice],
259
  outputs=dataframe_all_gen,
260
  )
261
-
262
-
263
  demo.launch(share=True)
264
 
 
6
  from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
7
 
8
 
9
+ ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # comma-separated values (pandas default separator)
10
+ ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
11
+
12
+
13
+ ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
14
+ ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
15
 
 
 
16
 
17
  METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
18
 
 
30
  "5B~10B",
31
  "API",
32
  ]
33
+
34
  }
35
 
36
 
 
38
 
39
  _BIBTEX = """
40
  @misc{zhang2024chinesesafechinesebenchmarkevaluating,
41
+ title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
42
  author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
43
  year={2024},
44
  eprint={2410.18491},
45
  archivePrefix={arXiv},
46
  primaryClass={cs.CL},
47
+ url={https://arxiv.org/abs/2410.18491},
48
  }
49
  """
50
 
 
64
 
65
  def format_csv_numbers_second(text):
66
  return text.split()
67
+
68
+
69
  def format_number(x):
70
  return float(f"{x:.3}")
71
 
 
75
  ):
76
  df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
77
  df = df.drop(columns="Size")
78
+
79
  leaderboard_table = gr.components.Dataframe(
80
  value=df,
81
  interactive=False,
 
103
  ):
104
  df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
105
  df = df.drop(columns="Size")
106
+
107
  # get subclass
108
  subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
109
  df = df[subclass_choice_label]
110
+
111
  leaderboard_table = gr.components.Dataframe(
112
  value=df,
113
  interactive=False,
 
122
  ):
123
  df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
124
  df = df.drop(columns="Size")
125
+
126
  # get subclass
127
  subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
128
  df = df[subclass_choice_label]
129
+
130
  leaderboard_table = gr.components.Dataframe(
131
  value=df,
132
  interactive=False,
 
145
  subclass_choice = main_choice
146
  leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
147
  return leaderboard_table
148
+
149
  def get_dataset_classfier_per(
150
  model_size: List[str],
151
  main_choice: List[str],
 
166
 
167
  with gr.Row():
168
  gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
169
+
170
  with gr.Row():
171
  gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
172
+
173
  with gr.Row():
174
  with gr.Column(scale=0.8):
175
  main_choice = gr.Dropdown(
 
178
  label="Type",
179
  info="Please choose the type to display.",
180
  )
181
+
182
+ with gr.Column(scale=10):
183
  model_choice = gr.CheckboxGroup(
184
  choices=CLASSIFICATION["model_size"],
185
  value=CLASSIFICATION["model_size"], # all sizes selected by default
 
190
  # 👉 this part builds the tabs holding the CSV-backed result tables
191
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
192
  # ----------------- modify text -----------------
193
+
194
  with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
195
  dataframe_all_gen = gr.components.Dataframe(
196
  elem_id="leaderboard-table",
197
  )
198
+
199
  with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
200
  dataframe_all_per = gr.components.Dataframe(
201
  elem_id="leaderboard-table",
 
204
  # ----------------- modify text -----------------
205
  with gr.Row():
206
  gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
207
+
208
  with gr.Row():
209
  gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
210
+
211
  # 👉 this part is for citation
212
  with gr.Row():
213
  with gr.Accordion("📙 Citation", open=True):
 
218
  elem_id="citation-button",
219
  show_copy_button=True
220
  )
221
+
222
  gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
223
+
224
  # --------------------------- all --------------------------------
225
  # this is all result Perplexity
226
+
227
  main_choice.change(
228
  get_dataset_classfier_per,
229
  inputs=[model_choice, main_choice],
230
  outputs=dataframe_all_per,
231
  )
232
+
233
  model_choice.change(
234
  get_dataset_classfier_per,
235
  inputs=[model_choice, main_choice],
 
241
  inputs=[model_choice, main_choice],
242
  outputs=dataframe_all_per,
243
  )
244
+
245
  # event handlers that refresh the Generation results table
246
  main_choice.change(
247
  get_dataset_classfier_gen,
248
  inputs=[model_choice, main_choice],
249
  outputs=dataframe_all_gen,
250
  )
251
+
252
  model_choice.change(
253
  get_dataset_classfier_gen,
254
  inputs=[model_choice, main_choice],
255
  outputs=dataframe_all_gen,
256
  )
257
+
258
  demo.load(
259
  fn=get_dataset_classfier_gen,
260
  inputs=[model_choice, main_choice],
261
  outputs=dataframe_all_gen,
262
  )
263
+
264
+
265
  demo.launch(share=True)
266
 
data/chinese_benchmark_per.csv CHANGED
@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
43
  Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
44
  Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
45
  Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
46
- Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
 
43
  Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
44
  Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
45
  Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
46
+ Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
data/subclass_gen.csv CHANGED
@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
48
  QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
49
  Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
50
  Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
51
- OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
 
48
  QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
49
  Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
50
  Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
51
+ OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
data/subclass_per.csv CHANGED
@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
41
  QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
42
  Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
43
  OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
44
- Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
 
41
  QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
42
  Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
43
  OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
44
+ Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814