gardari committed on
Commit
7b934d3
·
verified ·
1 Parent(s): 47df215

Update scores for newer models

Browse files
Files changed (1) hide show
  1. score.py +24 -12
score.py CHANGED
@@ -6,64 +6,76 @@ import re
6
  # Model scores
7
  BENCHMARK_SCORES = {
8
  "icelandic-winogrande": {
 
 
9
  "o1-preview": 92.8,
10
  "Claude 3.5 Sonnet": 91.3,
11
  "GPT-4o": 85.4,
12
  # "GPT-4-turbo": 85.8,
13
- "Hermes 3 Llama 3.1 405B fp8": 70.6,
14
  "Claude 2.1": 55.1,
15
  "GPT-3.5-turbo": 52.0,
16
- "Deepseek V3": 75.9,
17
  },
18
  "grammatical-error-detection": {
 
 
19
  "o1-preview": 74.5,
20
  "Claude 3.5 Sonnet": 72.5,
21
  "GPT-4o": 68.0,
22
  # "GPT-4-turbo": 60.5,
23
- "Hermes 3 Llama 3.1 405B fp8": 53.5,
24
  "Claude 2.1": 52.5,
25
  "GPT-3.5-turbo": 52.0,
26
- "Deepseek V3": 57.0,
27
  },
28
  "icelandic-inflection-all": {
 
 
29
  "o1-preview": 84.4,
30
  "Claude 3.5 Sonnet": 88.8,
31
  "GPT-4o": 87.8,
32
  # "GPT-4-turbo": 76.6,
33
- "Hermes 3 Llama 3.1 405B fp8": 61.8,
34
  "Claude 2.1": 55.2,
35
  "GPT-3.5-turbo": 39.1,
36
- "Deepseek V3": 77.3,
37
  },
38
  "icelandic-belebele": {
 
 
39
  "o1-preview": 92.2,
40
  "Claude 3.5 Sonnet": 92.2,
41
  "GPT-4o": 90.4,
42
  # "GPT-4-turbo": 89.3,
43
- "Hermes 3 Llama 3.1 405B fp8": 86.1,
44
  "Claude 2.1": 42.1,
45
  "GPT-3.5-turbo": 59.2,
46
- "Deepseek V3": 87.9,
47
  },
48
  "icelandic-arc-challenge": {
 
 
49
  "o1-preview": 93.4,
50
  "Claude 3.5 Sonnet": 91.3,
51
  "GPT-4o": 90.4,
52
  # "GPT-4-turbo": 88.7,
53
- "Hermes 3 Llama 3.1 405B fp8": 72.0,
54
  "Claude 2.1": 59.9,
55
  "GPT-3.5-turbo": 49.5,
56
- "Deepseek V3": 79.7,
57
  },
58
  "icelandic-wiki-qa": {
 
 
59
  "o1-preview": 44.5,
60
  "Claude 3.5 Sonnet": 45.2,
61
  "GPT-4o": 38.0,
62
  # "GPT-4-turbo": 31.0,
63
- "Hermes 3 Llama 3.1 405B fp8": 33.8,
64
  "Claude 2.1": 21.1,
65
  "GPT-3.5-turbo": 15.0,
66
- "Deepseek V3": 27.2,
67
  },
68
  }
69
 
 
6
  # Model scores
7
  BENCHMARK_SCORES = {
8
  "icelandic-winogrande": {
9
+ "GPT-5": 94.39,
10
+ "Gemini Pro 2.5": 92.83,
11
  "o1-preview": 92.8,
12
  "Claude 3.5 Sonnet": 91.3,
13
  "GPT-4o": 85.4,
14
  # "GPT-4-turbo": 85.8,
15
+ "Hermes 3 Llama 3.1 405B": 70.6,
16
  "Claude 2.1": 55.1,
17
  "GPT-3.5-turbo": 52.0,
18
+ "Deepseek R1": 81.43,
19
  },
20
  "grammatical-error-detection": {
21
+ "GPT-5": 71.50,
22
+ "Gemini Pro 2.5": 71.50,
23
  "o1-preview": 74.5,
24
  "Claude 3.5 Sonnet": 72.5,
25
  "GPT-4o": 68.0,
26
  # "GPT-4-turbo": 60.5,
27
+ "Hermes 3 Llama 3.1 405B": 53.5,
28
  "Claude 2.1": 52.5,
29
  "GPT-3.5-turbo": 52.0,
30
+ "Deepseek R1": 59.0,
31
  },
32
  "icelandic-inflection-all": {
33
+ "Gemini Pro 2.5": 90.79,
34
+ "GPT-5": 95.21,
35
  "o1-preview": 84.4,
36
  "Claude 3.5 Sonnet": 88.8,
37
  "GPT-4o": 87.8,
38
  # "GPT-4-turbo": 76.6,
39
+ "Hermes 3 Llama 3.1 405B": 61.8,
40
  "Claude 2.1": 55.2,
41
  "GPT-3.5-turbo": 39.1,
42
+ "Deepseek R1": 68.50,
43
  },
44
  "icelandic-belebele": {
45
+ "Gemini Pro 2.5": 93.89,
46
+ "GPT-5": 93.11,
47
  "o1-preview": 92.2,
48
  "Claude 3.5 Sonnet": 92.2,
49
  "GPT-4o": 90.4,
50
  # "GPT-4-turbo": 89.3,
51
+ "Hermes 3 Llama 3.1 405B": 86.1,
52
  "Claude 2.1": 42.1,
53
  "GPT-3.5-turbo": 59.2,
54
+ "Deepseek R1": 90.33,
55
  },
56
  "icelandic-arc-challenge": {
57
+ "Gemini Pro 2.5": 93.94,
58
+ "GPT-5": 94.37,
59
  "o1-preview": 93.4,
60
  "Claude 3.5 Sonnet": 91.3,
61
  "GPT-4o": 90.4,
62
  # "GPT-4-turbo": 88.7,
63
+ "Hermes 3 Llama 3.1 405B": 72.0,
64
  "Claude 2.1": 59.9,
65
  "GPT-3.5-turbo": 49.5,
66
+ "Deepseek R1": 90.61,
67
  },
68
  "icelandic-wiki-qa": {
69
+ "Gemini Pro 2.5": 52.66,
70
+ "GPT-5": 45.79,
71
  "o1-preview": 44.5,
72
  "Claude 3.5 Sonnet": 45.2,
73
  "GPT-4o": 38.0,
74
  # "GPT-4-turbo": 31.0,
75
+ "Hermes 3 Llama 3.1 405B": 33.8,
76
  "Claude 2.1": 21.1,
77
  "GPT-3.5-turbo": 15.0,
78
+ "Deepseek R1": 29.24,
79
  },
80
  }
81