Upload chinese_benchmark_gen.csv
#16
by Jerry0723 - opened
- data/chinese_benchmark_gen.csv +54 -43
data/chinese_benchmark_gen.csv
CHANGED
@@ -1,43 +1,54 @@
- [43 removed lines: the previous version of the table; the diff view truncated their content, leaving only the fragments "Model" (line 1), "Baichuan2-" (line 20), and "Phi-3-" (line 32)]
1 + Model,Size,Accuracy/std,Precision_Unsafe/std,Recall_Unsafe/std,Precision_Safe/std,Recall_Safe/std
2 + GPT-4o,API,73.78/0.30,97.75/0.13,48.66/0.04,65.84/0.55,98.88/0.04
3 + GPT-4-Turbo,API,71.67/0.17,80.13/0.64,57.59/0.69,66.93/0.44,85.74/0.35
4 + Perspective,API,69.28/0.32,69.96/0.79,67.49/0.32,68.64/0.32,71.06/0.43
5 + GPT-3.5,API,64.70/0.44,76.12/0.55,42.79/0.64,60.24/0.76,86.59/0.32
6 + Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,72.48/0.40
7 + Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
8 + Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
9 + Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
10 + Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
11 + Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
12 + DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
13 + GLM-4-9B-Chat,5B~10B,70.96/0.23,82.15/0.55,53.73/0.48,65.50/0.18,88.27/0.41
14 + Mistral-7B-Instruct-v0.3,5B~10B,70.41/0.41,68.55/0.52,75.67/0.22,72.71/0.26,65.12/0.58
15 + Qwen1.5-7B-Chat,5B~10B,70.36/0.39,64.66/0.27,90.09/0.57,83.55/0.82,50.53/0.18
16 + Phi-3-small-128k-instruct,5B~10B,67.43/0.26,72.10/0.54,57.35/0.17,64.33/0.09,77.61/0.43
17 + Ministral-8B-Instruct-2410,5B~10B,62.32/0.01,62.71/0.19,61.60/0.29,61.94/0.19,63.05/0.28
18 + Yi-1.5-9B-Chat,5B~10B,62.12/0.38,64.42/0.42,54.53/0.43,60.43/0.36,69.75/0.37
19 + Llama3-ChatQA-1.5-8B,5B~10B,61.28/0.40,57.63/0.20,85.84/0.43,72.02/0.95,36.61/0.54
20 + Baichuan2-7B-Chat,5B~10B,59.43/0.24,72.06/0.66,31.11/0.40,55.95/0.12,87.89/0.20
21 + InternLM2-chat-7B,5B~10B,58.79/0.09,62.70/0.19,43.88/0.17,56.68/0.14,73.77/0.13
22 + GPT-J-6B,5B~10B,52.65/0.32,52.42/0.32,62.00/0.42,52.99/0.37,43.21/0.92
23 + Opt-6.7B,5B~10B,50.00/0.11,50.17/0.17,64.70/0.35,49.69/0.04,35.18/0.44
24 + Qwen3-4B,5B~10B,74.95/0.01,76.47/0.01,72.10/0.00,73.61/0.01,77.81/0.01
25 + Gemma-3-4B-it,5B~10B,71.41/0.00,66.54/0.00,86.12/0.00,80.33/0.00,56.70/0.00
26 + phi-4,10B~20B,72.24/0.24,76.59/0.46,64.42/0.51,69.06/0.15,80.13/0.62
27 + InternLM2-Chat-20B,10B~20B,70.21/0.55,73.30/0.70,63.79/0.43,67.82/0.45,76.65/0.67
28 + Qwen1.5-14B-Chat,10B~20B,68.25/0.44,65.87/0.37,76.02/0.72,71.51/0.59,60.44/0.20
29 + Phi-3-medium-128k-instruct,10B~20B,64.30/0.06,63.89/0.13,66.53/0.52,64.76/0.26,62.05/0.42
30 + Baichuan2-13B-Chat,10B~20B,62.86/0.31,64.17/0.33,58.61/0.80,61.75/0.30,67.13/0.56
31 + Mistral-Nemo-Instruct-2407,10B~20B,59.71/0.45,61.79/0.52,51.82/0.48,58.20/0.44,67.68/0.44
32 + Phi-3-medium-4k-instruct,10B~20B,57.79/0.45,58.69/0.37,53.88/0.62,57.02/0.55,61.74/0.55
33 + Ziya2-13B-Chat,10B~20B,53.40/0.43,53.33/0.38,56.18/0.41,53.48/0.53,50.62/0.61
34 + Opt-13B,10B~20B,50.18/0.26,50.29/0.20,69.97/0.37,49.94/0.47,30.22/0.31
35 + Moonlight-16B-A3B-Instruct,10B~20B,45.16/0.43,44.16/0.64,34.79/0.67,45.82/0.33,55.62/0.35
36 + Qwen3-14B,10B~20B,68.54/0.01,67.24/0.01,72.29/0.00,70.04/0.00,64.78/0.01
37 + Gemma-3-12B-it,10B~20B,65.63/0.00,62.69/0.00,77.18/0.00,70.32/0.00,54.07/0.00
38 + DeepSeek-LLM-67B-Chat,>65B,76.76/0.35,73.40/0.37,84.26/0.40,81.34/0.35,69.19/0.64
39 + Llama3-ChatQA-1.5-70B,>65B,65.29/0.29,66.24/0.50,62.92/0.12,64.43/0.19,67.69/0.63
40 + Qwen2.5-72B-Instruct,>65B,63.41/0.77,66.00/0.95,56.00/0.62,61.49/0.65,70.90/0.96
41 + Qwen1.5-72B-Chat,>65B,62.91/0.50,73.86/0.84,40.46/0.97,58.75/0.35,85.55/0.62
42 + Opt-66B,>65B,54.46/0.17,53.22/0.06,76.94/0.24,57.73/0.49,31.77/0.28
43 + Qwen2-72B-Instruct,>65B,54.08/0.20,58.10/0.60,30.72/0.45,52.63/0.05,77.65/0.36
44 + DeepSeek-R1-Distill-Llama-70B,>65B,52.93/0.18,59.69/0.47,19.33/0.38,51.62/0.16,86.83/0.18
45 + Llama-3.1-70B-Instruct,>65B,52.84/0.38,59.07/1.22,19.82/0.85,51.57/0.24,86.14/0.58
46 + Llama-3.3-70B-Instruct,>65B,50.87/0.07,54.51/0.86,13.19/0.10,50.37/0.06,88.89/0.39
47 + Qwen3-32B,>65B,75.26/0.00,89.11/0.00,57.55/0.00,68.65/0.00,92.97/0.00
48 + Qwen2.5-32B-Instruct,~30B,69.64/0.39,92.13/0.45,43.24/0.83,62.70/0.25,96.27/0.20
49 + QwQ-32B-Preview,~30B,69.55/0.28,75.97/0.48,57.60/0.27,65.61/0.17,81.62/0.33
50 + Mistral-Small-24B-Instruct-2501,~30B,64.48/0.17,64.61/0.35,64.71/0.72,64.34/0.00,64.23/1.04
51 + Yi-1.5-34B-Chat,~30B,60.06/0.43,58.14/0.40,72.51/0.55,63.27/0.56,47.56/0.42
52 + Opt-30B,~30B,50.88/0.11,50.76/0.12,72.95/0.16,51.18/0.26,28.62/0.28
53 + Gemma-3-27B-it,~30B,68.50/0.00,68.37/0.00,68.84/0.00,68.62/0.00,68.15/0.00
54 + OpenThinker2-32B,~30B,65.01/0.01,74.90/0.01,45.13/0.01,60.74/0.01,84.87/0.00
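Note for consumers of this file: every metric column packs a mean and a standard deviation into one cell as `mean/std` (e.g. `73.78/0.30`). Below is a minimal sketch of how one might load and unpack the table, assuming pandas is installed and the file sits at `data/chinese_benchmark_gen.csv` as in this commit; the column names are taken from the header above, and the `top` ranking at the end is purely illustrative.

```python
import pandas as pd

# Load the benchmark table from this commit. Each metric cell packs a mean and
# a standard deviation as "mean/std", e.g. "73.78/0.30".
df = pd.read_csv("data/chinese_benchmark_gen.csv")

# Split every "<metric>/std" column into numeric <metric>_mean / <metric>_std
# columns so the values can be sorted and aggregated.
for col in [c for c in df.columns if c.endswith("/std")]:
    base = col.removesuffix("/std")
    df[[f"{base}_mean", f"{base}_std"]] = (
        df[col].str.split("/", expand=True).astype(float)
    )

# Illustrative usage: top 3 models by mean accuracy within each size bucket.
top = (
    df.sort_values("Accuracy_mean", ascending=False)
      .groupby("Size", sort=False)
      .head(3)
)
print(top[["Model", "Size", "Accuracy_mean", "Accuracy_std"]])
```

Keeping the `/std` packing in the raw CSV keeps the file compact, but it does mean every downstream script needs an unpacking step like the loop above before doing numeric work.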