Upload chinese_benchmark_gen.csv

#16
by Jerry0723 - opened
Files changed (1)
  1. data/chinese_benchmark_gen.csv +54 -43
data/chinese_benchmark_gen.csv CHANGED
@@ -1,43 +1,54 @@
- Model Size Accuracy/std Precision_Unsafe/std Recall_Unsafe/std Precision_Safe/std Recall_Safe/std
- DeepSeek-LLM-67B-Chat >65B 76.76/0.35 73.40/0.37 84.26/0.40 81.34/0.35 69.19/0.64
- Llama3-ChatQA-1.5-70B >65B 65.29/0.29 66.24/0.50 62.92/0.12 64.43/0.19 67.69/0.63
- Qwen2.5-72B-Instruct >65B 63.41/0.77 66.00/0.95 56.00/0.62 61.49/0.65 70.90/0.96
- Qwen1.5-72B-Chat >65B 62.91/0.50 73.86/0.84 40.46/0.97 58.75/0.35 85.55/0.62
- Opt-66B >65B 54.46/0.17 53.22/0.06 76.94/0.24 57.73/0.49 31.77/0.28
- Qwen2-72B-Instruct >65B 54.08/0.20 58.10/0.60 30.72/0.45 52.63/0.05 77.65/0.36
- DeepSeek-R1-Distill-Llama-70B >65B 52.93/0.18 59.69/0.47 19.33/0.38 51.62/0.16 86.83/0.18
- Llama-3.1-70B-Instruct >65B 52.84/0.38 59.07/1.22 19.82/0.85 51.57/0.24 86.14/0.58
- Llama-3.3-70B-Instruct >65B 50.87/0.07 54.51/0.86 13.19/0.10 50.37/0.06 88.89/0.39
- Qwen2.5-32B-Instruct ~30B 69.64/0.39 92.13/0.45 43.24/0.83 62.70/0.25 96.27/0.20
- QwQ-32B-Preview ~30B 69.55/0.28 75.97/0.48 57.60/0.27 65.61/0.17 81.62/0.33
- Mistral-Small-24B-Instruct-2501 ~30B 64.48/0.17 64.61/0.35 64.71/0.72 64.34/0.00 64.23/1.04
- Yi-1.5-34B-Chat ~30B 60.06/0.43 58.14/0.40 72.51/0.55 63.27/0.56 47.56/0.42
- Opt-30B ~30B 50.88/0.11 50.76/0.12 72.95/0.16 51.18/0.26 28.62/0.28
- phi-4 10B~20B 72.24/0.24 76.59/0.46 64.42/0.51 69.06/0.15 80.13/0.62
- InternLM2-Chat-20B 10B~20B 70.21/0.55 73.30/0.70 63.79/0.43 67.82/0.45 76.65/0.67
- Qwen1.5-14B-Chat 10B~20B 68.25/0.44 65.87/0.37 76.02/0.72 71.51/0.59 60.44/0.20
- Phi-3-medium-128k-instruct 10B~20B 64.30/0.06 63.89/0.13 66.53/0.52 64.76/0.26 62.05/0.42
- Baichuan2-13B-Chat 10B~20B 62.86/0.31 64.17/0.33 58.61/0.80 61.75/0.30 67.13/0.56
- Mistral-Nemo-Instruct-2407 10B~20B 59.71/0.45 61.79/0.52 51.82/0.48 58.20/0.44 67.68/0.44
- Phi-3-medium-4k-instruct 10B~20B 57.79/0.45 58.69/0.37 53.88/0.62 57.02/0.55 61.74/0.55
- Ziya2-13B-Chat 10B~20B 53.40/0.43 53.33/0.38 56.18/0.41 53.48/0.53 50.62/0.61
- Opt-13B 10B~20B 50.18/0.26 50.29/0.20 69.97/0.37 49.94/0.47 30.22/0.31
- Moonlight-16B-A3B-Instruct 10B~20B 45.16/0.43 44.16/0.64 34.79/0.67 45.82/0.33 55.62/0.35
- Phi-3-small-8k-instruct 5B~10B 72.73/0.47 73.67/0.63 71.12/0.49 71.85/0.35 74.36/0.59
- Gemma-1.1-7B-it 5B~10B 71.70/0.26 68.66/0.37 80.11/0.05 76.00/0.09 63.26/0.47
- DeepSeek-LLM-7B-Chat 5B~10B 71.63/0.17 69.50/0.15 77.33/0.67 74.33/0.41 65.90/0.38
- GLM-4-9B-Chat 5B~10B 70.96/0.23 82.15/0.55 53.73/0.48 65.50/0.18 88.27/0.41
- Mistral-7B-Instruct-v0.3 5B~10B 70.41/0.41 68.55/0.52 75.67/0.22 72.71/0.26 65.12/0.58
- Qwen1.5-7B-Chat 5B~10B 70.36/0.39 64.66/0.27 90.09/0.57 83.55/0.82 50.53/0.18
- Phi-3-small-128k-instruct 5B~10B 67.43/0.26 72.10/0.54 57.35/0.17 64.33/0.09 77.61/0.43
- Ministral-8B-Instruct-2410 5B~10B 62.32/0.01 62.71/0.19 61.60/0.29 61.94/0.19 63.05/0.28
- Yi-1.5-9B-Chat 5B~10B 62.12/0.38 64.42/0.42 54.53/0.43 60.43/0.36 69.75/0.37
- Llama3-ChatQA-1.5-8B 5B~10B 61.28/0.40 57.63/0.20 85.84/0.43 72.02/0.95 36.61/0.54
- Baichuan2-7B-Chat 5B~10B 59.43/0.24 72.06/0.66 31.11/0.40 55.95/0.12 87.89/0.20
- InternLM2-chat-7B 5B~10B 58.79/0.09 62.70/0.19 43.88/0.17 56.68/0.14 73.77/0.13
- GPT-J-6B 5B~10B 52.65/0.32 52.42/0.32 62.00/0.42 52.99/0.37 43.21/0.92
- Opt-6.7B 5B~10B 50.00/0.11 50.17/0.17 64.70/0.35 49.69/0.04 35.18/0.44
- GPT-4o API 73.78/0.30 97.75/0.13 48.66/0.04 65.84/0.55 98.88/0.04
- GPT-4-Turbo API 71.67/0.17 80.13/0.64 57.59/0.69 66.93/0.44 85.74/0.35
- Perspective API 69.28/0.32 69.96/0.79 67.49/0.32 68.64/0.32 71.06/0.43
- GPT-3.5 API 64.70/0.44 76.12/0.55 42.79/0.64 60.24/0.76 86.59/0.32
+ Model,Size,Accuracy/std,Precision_Unsafe/std,Recall_Unsafe/std,Precision_Safe/std,Recall_Safe/std
+ GPT-4o,API,73.78/0.30,97.75/0.13,48.66/0.04,65.84/0.55,98.88/0.04
+ GPT-4-Turbo,API,71.67/0.17,80.13/0.64,57.59/0.69,66.93/0.44,85.74/0.35
+ Perspective,API,69.28/0.32,69.96/0.79,67.49/0.32,68.64/0.32,71.06/0.43
+ GPT-3.5,API,64.70/0.44,76.12/0.55,42.79/0.64,60.24/0.76,86.59/0.32
+ Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,72.48/0.40
+ Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
+ Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
+ Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
+ Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
+ Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
+ DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
+ GLM-4-9B-Chat,5B~10B,70.96/0.23,82.15/0.55,53.73/0.48,65.50/0.18,88.27/0.41
+ Mistral-7B-Instruct-v0.3,5B~10B,70.41/0.41,68.55/0.52,75.67/0.22,72.71/0.26,65.12/0.58
+ Qwen1.5-7B-Chat,5B~10B,70.36/0.39,64.66/0.27,90.09/0.57,83.55/0.82,50.53/0.18
+ Phi-3-small-128k-instruct,5B~10B,67.43/0.26,72.10/0.54,57.35/0.17,64.33/0.09,77.61/0.43
+ Ministral-8B-Instruct-2410,5B~10B,62.32/0.01,62.71/0.19,61.60/0.29,61.94/0.19,63.05/0.28
+ Yi-1.5-9B-Chat,5B~10B,62.12/0.38,64.42/0.42,54.53/0.43,60.43/0.36,69.75/0.37
+ Llama3-ChatQA-1.5-8B,5B~10B,61.28/0.40,57.63/0.20,85.84/0.43,72.02/0.95,36.61/0.54
+ Baichuan2-7B-Chat,5B~10B,59.43/0.24,72.06/0.66,31.11/0.40,55.95/0.12,87.89/0.20
+ InternLM2-chat-7B,5B~10B,58.79/0.09,62.70/0.19,43.88/0.17,56.68/0.14,73.77/0.13
+ GPT-J-6B,5B~10B,52.65/0.32,52.42/0.32,62.00/0.42,52.99/0.37,43.21/0.92
+ Opt-6.7B,5B~10B,50.00/0.11,50.17/0.17,64.70/0.35,49.69/0.04,35.18/0.44
+ Qwen3-4B,5B~10B,74.95/0.01,76.47/0.01,72.10/0.00,73.61/0.01,77.81/0.01
+ Gemma-3-4B-it,5B~10B,71.41/0.00,66.54/0.00,86.12/0.00,80.33/0.00,56.70/0.00
+ phi-4,10B~20B,72.24/0.24,76.59/0.46,64.42/0.51,69.06/0.15,80.13/0.62
+ InternLM2-Chat-20B,10B~20B,70.21/0.55,73.30/0.70,63.79/0.43,67.82/0.45,76.65/0.67
+ Qwen1.5-14B-Chat,10B~20B,68.25/0.44,65.87/0.37,76.02/0.72,71.51/0.59,60.44/0.20
+ Phi-3-medium-128k-instruct,10B~20B,64.30/0.06,63.89/0.13,66.53/0.52,64.76/0.26,62.05/0.42
+ Baichuan2-13B-Chat,10B~20B,62.86/0.31,64.17/0.33,58.61/0.80,61.75/0.30,67.13/0.56
+ Mistral-Nemo-Instruct-2407,10B~20B,59.71/0.45,61.79/0.52,51.82/0.48,58.20/0.44,67.68/0.44
+ Phi-3-medium-4k-instruct,10B~20B,57.79/0.45,58.69/0.37,53.88/0.62,57.02/0.55,61.74/0.55
+ Ziya2-13B-Chat,10B~20B,53.40/0.43,53.33/0.38,56.18/0.41,53.48/0.53,50.62/0.61
+ Opt-13B,10B~20B,50.18/0.26,50.29/0.20,69.97/0.37,49.94/0.47,30.22/0.31
+ Moonlight-16B-A3B-Instruct,10B~20B,45.16/0.43,44.16/0.64,34.79/0.67,45.82/0.33,55.62/0.35
+ Qwen3-14B,10B~20B,68.54/0.01,67.24/0.01,72.29/0.00,70.04/0.00,64.78/0.01
+ Gemma-3-12B-it,10B~20B,65.63/0.00,62.69/0.00,77.18/0.00,70.32/0.00,54.07/0.00
+ DeepSeek-LLM-67B-Chat,>65B,76.76/0.35,73.40/0.37,84.26/0.40,81.34/0.35,69.19/0.64
+ Llama3-ChatQA-1.5-70B,>65B,65.29/0.29,66.24/0.50,62.92/0.12,64.43/0.19,67.69/0.63
+ Qwen2.5-72B-Instruct,>65B,63.41/0.77,66.00/0.95,56.00/0.62,61.49/0.65,70.90/0.96
+ Qwen1.5-72B-Chat,>65B,62.91/0.50,73.86/0.84,40.46/0.97,58.75/0.35,85.55/0.62
+ Opt-66B,>65B,54.46/0.17,53.22/0.06,76.94/0.24,57.73/0.49,31.77/0.28
+ Qwen2-72B-Instruct,>65B,54.08/0.20,58.10/0.60,30.72/0.45,52.63/0.05,77.65/0.36
+ DeepSeek-R1-Distill-Llama-70B,>65B,52.93/0.18,59.69/0.47,19.33/0.38,51.62/0.16,86.83/0.18
+ Llama-3.1-70B-Instruct,>65B,52.84/0.38,59.07/1.22,19.82/0.85,51.57/0.24,86.14/0.58
+ Llama-3.3-70B-Instruct,>65B,50.87/0.07,54.51/0.86,13.19/0.10,50.37/0.06,88.89/0.39
+ Qwen3-32B,~30B,75.26/0.00,89.11/0.00,57.55/0.00,68.65/0.00,92.97/0.00
+ Qwen2.5-32B-Instruct,~30B,69.64/0.39,92.13/0.45,43.24/0.83,62.70/0.25,96.27/0.20
+ QwQ-32B-Preview,~30B,69.55/0.28,75.97/0.48,57.60/0.27,65.61/0.17,81.62/0.33
+ Mistral-Small-24B-Instruct-2501,~30B,64.48/0.17,64.61/0.35,64.71/0.72,64.34/0.00,64.23/1.04
+ Yi-1.5-34B-Chat,~30B,60.06/0.43,58.14/0.40,72.51/0.55,63.27/0.56,47.56/0.42
+ Opt-30B,~30B,50.88/0.11,50.76/0.12,72.95/0.16,51.18/0.26,28.62/0.28
+ Gemma-3-27B-it,~30B,68.50/0.00,68.37/0.00,68.84/0.00,68.62/0.00,68.15/0.00
+ OpenThinker2-32B,~30B,65.01/0.01,74.90/0.01,45.13/0.01,60.74/0.01,84.87/0.00
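
This upload also switches the file from a whitespace-delimited table to a true comma-separated CSV, with every metric cell packing the mean and standard deviation into a single `mean/std` string (e.g. `73.78/0.30`). A minimal pandas sketch for splitting those cells into numeric columns; the file path assumes this repo's layout, and the derived `*_std` column names are an illustrative choice, not part of the dataset:

```python
import pandas as pd

# Load the uploaded benchmark table (comma-separated after this change).
df = pd.read_csv("data/chinese_benchmark_gen.csv")

# Every metric column is named "<Metric>/std" and holds "mean/std" strings.
metric_cols = [c for c in df.columns if c not in ("Model", "Size")]
for col in metric_cols:
    base = col.split("/")[0]                       # e.g. "Accuracy"
    parts = df[col].str.split("/", expand=True).astype(float)
    df[base] = parts[0]                            # mean
    df[base + "_std"] = parts[1]                   # standard deviation
df = df.drop(columns=metric_cols)

# Example: the five most accurate models across all size buckets.
print(df.sort_values("Accuracy", ascending=False)
        .head(5)[["Model", "Size", "Accuracy", "Accuracy_std"]])
```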