mokamoto committed · Commit b61db1b · 1 Parent(s): 1f3572e

model tooltips on all pages
causal_analysis_table.html CHANGED
@@ -31,7 +31,7 @@
  </thead>
  <tbody>
  <tr>
- <td>Llama 3 70B Instruct</td>
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
  <td class="has-text-centered">0.148</td>
  <td class="has-text-centered">0.429</td>
  <td class="has-text-centered">0.148</td>
@@ -42,7 +42,7 @@
  <td class="has-text-centered">0.198</td>
  </tr>
  <tr>
- <td>Llama 3 8B Instruct</td>
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
  <td class="has-text-centered">0.097</td>
  <td class="has-text-centered">0.341</td>
  <td class="has-text-centered">0.097</td>
@@ -53,7 +53,7 @@
  <td class="has-text-centered performance-strong">0.380</td>
  </tr>
  <tr>
- <td>DBRX Instruct</td>
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
  <td class="has-text-centered">0.078</td>
  <td class="has-text-centered">0.521</td>
  <td class="has-text-centered">0.078</td>
@@ -64,7 +64,7 @@
  <td class="has-text-centered">0.235</td>
  </tr>
  <tr>
- <td>DeepSeek LLM (67B)</td>
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
  <td class="has-text-centered">0.026</td>
  <td class="has-text-centered">0.214</td>
  <td class="has-text-centered">0.026</td>
@@ -75,7 +75,7 @@
  <td class="has-text-centered">0.221</td>
  </tr>
  <tr>
- <td>Gemma 2 27B</td>
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
  <td class="has-text-centered">0.115</td>
  <td class="has-text-centered">0.510</td>
  <td class="has-text-centered">0.115</td>
@@ -86,7 +86,7 @@
  <td class="has-text-centered">0.262</td>
  </tr>
  <tr>
- <td>Gemma 2 9B</td>
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
  <td class="has-text-centered">0.115</td>
  <td class="has-text-centered">0.394</td>
  <td class="has-text-centered">0.115</td>
@@ -97,7 +97,7 @@
  <td class="has-text-centered">0.258</td>
  </tr>
  <tr>
- <td>Mistral (7B) Instruct v0.3</td>
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
  <td class="has-text-centered">0.078</td>
  <td class="has-text-centered">0.455</td>
  <td class="has-text-centered">0.078</td>
@@ -108,7 +108,7 @@
  <td class="has-text-centered">0.258</td>
  </tr>
  <tr>
- <td>Mixtral-8x22B Instruct</td>
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
  <td class="has-text-centered">0.131</td>
  <td class="has-text-centered">0.486</td>
  <td class="has-text-centered">0.131</td>
@@ -119,7 +119,7 @@
  <td class="has-text-centered performance-medium">0.318</td>
  </tr>
  <tr>
- <td>Mixtral-8x7B Instruct</td>
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
  <td class="has-text-centered">0.088</td>
  <td class="has-text-centered">0.510</td>
  <td class="has-text-centered">0.088</td>
@@ -130,7 +130,7 @@
  <td class="has-text-centered">0.273</td>
  </tr>
  <tr>
- <td>Qwen 2 Instruct (72B)</td>
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
  <td class="has-text-centered">0.139</td>
  <td class="has-text-centered">0.489</td>
  <td class="has-text-centered">0.139</td>
@@ -141,7 +141,7 @@
  <td class="has-text-centered">0.188</td>
  </tr>
  <tr>
- <td>WizardLM-2 8x22B</td>
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
  <td class="has-text-centered">0.076</td>
  <td class="has-text-centered">0.453</td>
  <td class="has-text-centered">0.076</td>
@@ -152,7 +152,7 @@
  <td class="has-text-centered">0.237</td>
  </tr>
  <tr>
- <td>DeepSeek-V3</td>
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
  <td class="has-text-centered">0.164</td>
  <td class="has-text-centered">0.528</td>
  <td class="has-text-centered">0.164</td>
@@ -163,7 +163,7 @@
  <td class="has-text-centered">0.248</td>
  </tr>
  <tr>
- <td>DeepSeek R1</td>
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
  <td class="has-text-centered performance-best">0.245</td>
  <td class="has-text-centered performance-strong">0.643</td>
  <td class="has-text-centered performance-best">0.245</td>
@@ -174,7 +174,7 @@
  <td class="has-text-centered">0.221</td>
  </tr>
  <tr>
- <td>QwQ-32B-Preview</td>
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
  <td class="has-text-centered">0.110</td>
  <td class="has-text-centered">0.473</td>
  <td class="has-text-centered">0.110</td>
@@ -185,7 +185,7 @@
  <td class="has-text-centered performance-best">0.465</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Mini</td>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
  <td class="has-text-centered">0.050</td>
  <td class="has-text-centered">0.280</td>
  <td class="has-text-centered">0.050</td>
@@ -196,7 +196,7 @@
  <td class="has-text-centered">0.295</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Large</td>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
  <td class="has-text-centered">0.076</td>
  <td class="has-text-centered">0.517</td>
  <td class="has-text-centered">0.076</td>
@@ -207,7 +207,7 @@
  <td class="has-text-centered">0.200</td>
  </tr>
  <tr>
- <td>Claude 3.5 Sonnet</td>
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
  <td class="has-text-centered">0.154</td>
  <td class="has-text-centered">0.564</td>
  <td class="has-text-centered">0.154</td>
@@ -218,7 +218,7 @@
  <td class="has-text-centered">0.235</td>
  </tr>
  <tr>
- <td>Claude 3 Haiku</td>
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
  <td class="has-text-centered">0.082</td>
  <td class="has-text-centered">0.388</td>
  <td class="has-text-centered">0.082</td>
@@ -229,7 +229,7 @@
  <td class="has-text-centered">0.203</td>
  </tr>
  <tr>
- <td>Cohere Command R 7B</td>
+ <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
  <td class="has-text-centered">0.089</td>
  <td class="has-text-centered">0.363</td>
  <td class="has-text-centered">0.089</td>
@@ -240,7 +240,7 @@
  <td class="has-text-centered">0.275</td>
  </tr>
  <tr>
- <td>Cohere Command R +</td>
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
  <td class="has-text-centered">0.090</td>
  <td class="has-text-centered">0.453</td>
  <td class="has-text-centered">0.090</td>
@@ -251,7 +251,7 @@
  <td class="has-text-centered">0.265</td>
  </tr>
  <tr>
- <td>Google Gemini 1.5 Pro</td>
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
  <td class="has-text-centered performance-medium">0.165</td>
  <td class="has-text-centered">0.514</td>
  <td class="has-text-centered performance-medium">0.165</td>
@@ -262,7 +262,7 @@
  <td class="has-text-centered">0.258</td>
  </tr>
  <tr>
- <td>OpenAI gpt-4o</td>
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
  <td class="has-text-centered">0.082</td>
  <td class="has-text-centered performance-medium">0.576</td>
  <td class="has-text-centered">0.082</td>
@@ -273,7 +273,7 @@
  <td class="has-text-centered">0.235</td>
  </tr>
  <tr>
- <td>OpenAI o1-mini</td>
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
  <td class="has-text-centered performance-strong">0.206</td>
  <td class="has-text-centered performance-best">0.648</td>
  <td class="has-text-centered performance-strong">0.206</td>
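Note: the tooltip-trigger cells above rely on static/css/tooltips.css and static/js/tooltips.js, which the tables link to (see fix_tooltips.sh below) but which are not part of this commit. As a rough illustration only, not the shipped stylesheet, a minimal rule set that would display the data-tooltip attribute could be written as a heredoc in the same style the script itself uses:

#!/bin/bash
# Hypothetical sketch of static/css/tooltips.css (the real file is not in this
# commit); renders the data-tooltip attribute via CSS generated content on hover.
mkdir -p static/css
cat > static/css/tooltips.css << 'EOF'
.tooltip-trigger { position: relative; cursor: help; }
.tooltip-trigger:hover::after {
  content: attr(data-tooltip);  /* text comes straight from the attribute */
  position: absolute;
  left: 0;
  bottom: 100%;
  width: 20rem;
  padding: 0.5rem;
  background: #363636;
  color: #fff;
  font-size: 0.75rem;
  z-index: 10;
}
/* assumed variant: anchor the bubble to the right edge instead */
.tooltip-trigger.tooltip-right:hover::after { left: auto; right: 0; }
EOF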
fix_tooltips.sh CHANGED
@@ -1,7 +1,47 @@
  #!/bin/bash
  
- # Script to fix tooltips in all HTML files
+ # Script to add model tooltips and fix existing tooltips in all HTML files
  
+ # Model tooltip definitions - exact descriptions from cost analysis tab
+ declare -A model_tooltips
+ model_tooltips["OpenAI gpt-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
+ model_tooltips["GPT-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
+ model_tooltips["OpenAI o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
+ model_tooltips["o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
+ model_tooltips["Claude 3.5 Sonnet"]="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities."
+ model_tooltips["Claude 3 Haiku"]="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks."
+ model_tooltips["Google Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
+ model_tooltips["Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
+ model_tooltips["Cohere Command R 7B"]="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size."
+ model_tooltips["Cohere Command R +"]="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart."
+ model_tooltips["DeepSeek R1"]="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks."
+ model_tooltips["DeepSeek-V3"]="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities."
+ model_tooltips["DeepSeek LLM (67B)"]="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities."
+ model_tooltips["Llama 3 70B Instruct"]="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities."
+ model_tooltips["Llama 3 8B Instruct"]="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities."
+ model_tooltips["DBRX Instruct"]="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities."
+ model_tooltips["Mixtral-8x22B Instruct"]="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance."
+ model_tooltips["Mixtral-8x7B Instruct"]="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities."
+ model_tooltips["Mistral (7B) Instruct v0.3"]="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size."
+ model_tooltips["Qwen 2 Instruct (72B)"]="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities."
+ model_tooltips["WizardLM-2 8x22B"]="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks."
+ model_tooltips["Gemma 2 27B"]="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following."
+ model_tooltips["Gemma 2 9B"]="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size."
+ model_tooltips["QwQ-32B-Preview"]="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks."
+ model_tooltips["Jamba 1.5 Mini"]="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks."
+ model_tooltips["Jamba 1.5 Large"]="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart."
+
+ # Files to process
+ files=(
+   "text_classification_table.html"
+   "sentiment_analysis_table.html"
+   "information_retrieval_table.html"
+   "causal_analysis_table.html"
+   "text_summarization_table.html"
+   "qa_table.html"
+ )
+
+ # Fix existing dataset tooltips
  # Fix tooltips in information_retrieval_table.html
  sed -i 's/tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain/tooltip-trigger tooltip-right" data-tooltip="A dataset for information retrieval in the financial domain/g' information_retrieval_table.html
  
@@ -17,4 +57,162 @@ sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Manually-annotated datase
  # Fix tooltips in text_summarization_table.html (in case the tooltip-right class isn't working)
  sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/g' text_summarization_table.html
  
+ # Add or update model tooltips to each file
+ for file in "${files[@]}"; do
+   echo "Processing $file..."
+
+   # For each model in our list
+   for model in "${!model_tooltips[@]}"; do
+     # Convert model name to a sed-safe string by escaping special characters
+     model_sed_safe=$(echo "$model" | sed 's/[\/&]/\\&/g')
+     tooltip_sed_safe=$(echo "${model_tooltips[$model]}" | sed 's/[\/&]/\\&/g')
+
+     # First, update existing tooltips if they exist
+     sed -i "s/data-title=\"$model_sed_safe\" data-tooltip=\"[^\"]*\"/data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\"/g" "$file"
+
+     # Then, add tooltips to plain model names without tooltips
+     sed -i "s/<td>$model_sed_safe<\/td>/<td class=\"tooltip-trigger tooltip-right\" data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\">$model_sed_safe<\/td>/g" "$file"
+   done
+
+   # Ensure tooltip script is included at the bottom of the file
+   if ! grep -q "tooltips.js" "$file"; then
+     echo "<script src=\"static/js/tooltips.js\"></script>" >> "$file"
+   fi
+
+   if ! grep -q "fixed-tooltips.js" "$file"; then
+     echo "<script src=\"static/js/fixed-tooltips.js\"></script>" >> "$file"
+   fi
+
+   # Add tooltips.css if not already included
+   if ! grep -q "tooltips.css" "$file"; then
+     sed -i '1i<link rel="stylesheet" href="static/css/tooltips.css">' "$file"
+   fi
+ done
+
+ # Also update results.html to ensure proper tooltip handling
+ echo "Adding tooltip fix to results.html..."
+
+ # Copy the model tooltip fixing code for all tabs to a new JS file
+ cat > static/js/model-tooltips.js << EOF
+ document.addEventListener('DOMContentLoaded', function() {
+   // Fix model tooltips in all tabs
+   function fixAllModelTooltips() {
+     console.log("Fixing model tooltips in all tabs");
+
+     // Find all model name cells (first column in all tables)
+     const modelCells = document.querySelectorAll('td:first-child');
+
+     // Process each model cell
+     modelCells.forEach(cell => {
+       // Skip cells that already have tooltips
+       if (cell.classList.contains('tooltip-trigger')) {
+         return;
+       }
+
+       // Get the model name
+       const modelName = cell.textContent.trim();
+
+       // Add tooltip-trigger class and position style
+       cell.classList.add('tooltip-trigger');
+       cell.style.position = 'relative';
+
+       // Add data-title attribute with the model name
+       cell.setAttribute('data-title', modelName);
+
+       // Add descriptive tooltip based on model
+       let tooltipText = "";
+
+       // Set descriptive tooltip based on model name
+       if (modelName.includes("GPT-4o")) {
+         tooltipText = "OpenAI's advanced proprietary closed-source model. One of the top performers across most tasks.";
+       } else if (modelName.includes("o1-mini")) {
+         tooltipText = "Compact proprietary model from OpenAI. Shows strong performance on causal analysis tasks.";
+       } else if (modelName.includes("Claude 3.5 Sonnet")) {
+         tooltipText = "Anthropic's model optimized for advanced reasoning. Strong performer on text classification and summarization.";
+       } else if (modelName.includes("Claude 3 Haiku")) {
+         tooltipText = "Anthropic's smaller, efficiency-focused model in the Claude series.";
+       } else if (modelName.includes("Gemini 1.5")) {
+         tooltipText = "Google's highly capable proprietary model.";
+       } else if (modelName.includes("Command R 7B")) {
+         tooltipText = "A 7-billion parameter model from Cohere focused on instruction-following.";
+       } else if (modelName.includes("Command R +")) {
+         tooltipText = "An improved version of Cohere's Command R model.";
+       } else if (modelName.includes("DeepSeek R1")) {
+         tooltipText = "Open-weight model from DeepSeek AI with 671B parameters (MoE architecture). One of the top performers in the benchmark.";
+       } else if (modelName.includes("DeepSeek-V3") || modelName.includes("DeepSeek V3")) {
+         tooltipText = "Open-weight model from DeepSeek AI with 685B parameters (MoE architecture).";
+       } else if (modelName.includes("DeepSeek LLM")) {
+         tooltipText = "A 67-billion parameter chat-optimized model from DeepSeek AI.";
+       } else if (modelName.includes("Llama 3 70B")) {
+         tooltipText = "Meta's 70-billion parameter dense model, optimized for instruction-following tasks.";
+       } else if (modelName.includes("Llama 3 8B")) {
+         tooltipText = "Meta's 8-billion parameter efficient model variant.";
+       } else if (modelName.includes("DBRX")) {
+         tooltipText = "Databricks' 132B parameter MoE model.";
+       } else if (modelName.includes("Mixtral-8x22B")) {
+         tooltipText = "141B parameter MoE model from Mistral AI with eight 22-billion parameter sub-models.";
+       } else if (modelName.includes("Mixtral-8x7B")) {
+         tooltipText = "46.7B parameter MoE model from Mistral AI with eight 7-billion parameter sub-models.";
+       } else if (modelName.includes("Mistral")) {
+         tooltipText = "A 7-billion parameter instruction-tuned model from Mistral AI.";
+       } else if (modelName.includes("Qwen 2")) {
+         tooltipText = "Alibaba's 72-billion parameter instruction-following model.";
+       } else if (modelName.includes("WizardLM")) {
+         tooltipText = "A 176B parameter MoE model focused on complex reasoning.";
+       } else if (modelName.includes("Gemma 2 27B")) {
+         tooltipText = "Google's open-weight 27B parameter model.";
+       } else if (modelName.includes("Gemma 2 9B")) {
+         tooltipText = "Google's open-weight 9B parameter efficient model.";
+       } else if (modelName.includes("QwQ-32B")) {
+         tooltipText = "Qwen's experimental MoE model with 32B parameters.";
+       } else if (modelName.includes("Jamba 1.5 Mini")) {
+         tooltipText = "A compact variant of the Jamba model series.";
+       } else if (modelName.includes("Jamba 1.5 Large")) {
+         tooltipText = "An expanded variant of the Jamba model series.";
+       } else {
+         tooltipText = "A large language model from the FLaME evaluation benchmark.";
+       }
+
+       // Set the tooltip
+       cell.setAttribute('data-tooltip', tooltipText);
+     });
+
+     // After adding attributes, run the tooltip fix
+     if (window.fixProblemTooltips) {
+       window.fixProblemTooltips();
+     }
+   }
+
+   // Run on page load
+   setTimeout(fixAllModelTooltips, 500);
+
+   // Run when tabs are clicked
+   const tabs = document.querySelectorAll('.tabs li');
+   tabs.forEach(tab => {
+     tab.addEventListener('click', () => {
+       // Give time for content to be displayed
+       setTimeout(fixAllModelTooltips, 200);
+     });
+   });
+ });
+ EOF
+
+ # Add script inclusion to results.html if not already there
+ if ! grep -q "model-tooltips.js" "results.html"; then
+   # Add the script link before the closing body tag
+   sed -i 's/<\/body>/<script src="static\/js\/model-tooltips.js"><\/script>\n<\/body>/g' "results.html"
+ fi
+
+ # Add tooltip fix to ensure all tabs initialize properly
+ if ! grep -q "window.fixProblemTooltips" "results.html"; then
+   # Add call to fix all tooltips when tabs are clicked
+   sed -i '/document\.addEventListener.*DOMContentLoaded/a \
+ // Fix all tooltips in all tabs\
+ setTimeout(function() {\
+ if (window.fixProblemTooltips) {\
+ window.fixProblemTooltips();\
+ }\
+ }, 500);' "results.html"
+ fi
+
  echo "Fixed tooltips in all HTML files"
information_retrieval_table.html CHANGED
@@ -46,7 +46,7 @@
  </thead>
  <tbody>
  <tr>
- <td>Llama 3 70B Instruct</td>
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
  <td class="has-text-centered">0.715</td>
  <td class="has-text-centered">0.693</td>
  <td class="has-text-centered">0.701</td>
@@ -69,7 +69,7 @@
  <td class="has-text-centered">0.469</td>
  </tr>
  <tr>
- <td>Llama 3 8B Instruct</td>
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
  <td class="has-text-centered">0.581</td>
  <td class="has-text-centered">0.558</td>
  <td class="has-text-centered">0.565</td>
@@ -92,7 +92,7 @@
  <td class="has-text-centered">0.350</td>
  </tr>
  <tr>
- <td>DBRX Instruct</td>
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
  <td class="has-text-centered">0.516</td>
  <td class="has-text-centered">0.476</td>
  <td class="has-text-centered">0.489</td>
@@ -115,7 +115,7 @@
  <td class="has-text-centered">0.006</td>
  </tr>
  <tr>
- <td>DeepSeek LLM (67B)</td>
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
  <td class="has-text-centered">0.752</td>
  <td class="has-text-centered">0.742</td>
  <td class="has-text-centered">0.745</td>
@@ -138,7 +138,7 @@
  <td class="has-text-centered">0.416</td>
  </tr>
  <tr>
- <td>Gemma 2 27B</td>
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
  <td class="has-text-centered">0.772</td>
  <td class="has-text-centered">0.754</td>
  <td class="has-text-centered">0.761</td>
@@ -161,7 +161,7 @@
  <td class="has-text-centered">0.298</td>
  </tr>
  <tr>
- <td>Gemma 2 9B</td>
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
  <td class="has-text-centered">0.665</td>
  <td class="has-text-centered">0.643</td>
  <td class="has-text-centered">0.651</td>
@@ -184,7 +184,7 @@
  <td class="has-text-centered">0.367</td>
  </tr>
  <tr>
- <td>Mistral (7B) Instruct v0.3</td>
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
  <td class="has-text-centered">0.540</td>
  <td class="has-text-centered">0.522</td>
  <td class="has-text-centered">0.526</td>
@@ -207,7 +207,7 @@
  <td class="has-text-centered">0.368</td>
  </tr>
  <tr>
- <td>Mixtral-8x22B Instruct</td>
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
  <td class="has-text-centered">0.653</td>
  <td class="has-text-centered">0.625</td>
  <td class="has-text-centered">0.635</td>
@@ -230,7 +230,7 @@
  <td class="has-text-centered">0.435</td>
  </tr>
  <tr>
- <td>Mixtral-8x7B Instruct</td>
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
  <td class="has-text-centered">0.613</td>
  <td class="has-text-centered">0.591</td>
  <td class="has-text-centered">0.598</td>
@@ -253,7 +253,7 @@
  <td class="has-text-centered">0.267</td>
  </tr>
  <tr>
- <td>Qwen 2 Instruct (72B)</td>
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
  <td class="has-text-centered">0.766</td>
  <td class="has-text-centered">0.742</td>
  <td class="has-text-centered">0.748</td>
@@ -276,7 +276,7 @@
  <td class="has-text-centered">0.483</td>
  </tr>
  <tr>
- <td>WizardLM-2 8x22B</td>
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
  <td class="has-text-centered">0.755</td>
  <td class="has-text-centered">0.741</td>
  <td class="has-text-centered">0.744</td>
@@ -299,7 +299,7 @@
  <td class="has-text-centered">0.226</td>
  </tr>
  <tr>
- <td>DeepSeek-V3</td>
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
  <td class="has-text-centered performance-medium">0.798</td>
  <td class="has-text-centered performance-medium">0.787</td>
  <td class="has-text-centered performance-medium">0.790</td>
@@ -322,7 +322,7 @@
  <td class="has-text-centered">0.549</td>
  </tr>
  <tr>
- <td>DeepSeek R1</td>
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
  <td class="has-text-centered performance-best">0.813</td>
  <td class="has-text-centered performance-best">0.805</td>
  <td class="has-text-centered performance-best">0.807</td>
@@ -345,7 +345,7 @@
  <td class="has-text-centered performance-medium">0.587</td>
  </tr>
  <tr>
- <td>QwQ-32B-Preview</td>
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
  <td class="has-text-centered">0.695</td>
  <td class="has-text-centered">0.681</td>
  <td class="has-text-centered">0.685</td>
@@ -368,7 +368,7 @@
  <td class="has-text-centered">0.005</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Mini</td>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
  <td class="has-text-centered">0.564</td>
  <td class="has-text-centered">0.556</td>
  <td class="has-text-centered">0.552</td>
@@ -391,7 +391,7 @@
  <td class="has-text-centered">0.132</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Large</td>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
  <td class="has-text-centered">0.707</td>
  <td class="has-text-centered">0.687</td>
  <td class="has-text-centered">0.693</td>
@@ -414,7 +414,7 @@
  <td class="has-text-centered">0.397</td>
  </tr>
  <tr>
- <td>Claude 3.5 Sonnet</td>
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
  <td class="has-text-centered performance-strong">0.811</td>
  <td class="has-text-centered performance-strong">0.794</td>
  <td class="has-text-centered performance-strong">0.799</td>
@@ -437,7 +437,7 @@
  <td class="has-text-centered performance-strong">0.655</td>
  </tr>
  <tr>
- <td>Claude 3 Haiku</td>
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
  <td class="has-text-centered">0.732</td>
  <td class="has-text-centered">0.700</td>
  <td class="has-text-centered">0.711</td>
@@ -460,7 +460,7 @@
  <td class="has-text-centered">0.494</td>
  </tr>
  <tr>
- <td>Cohere Command R +</td>
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
  <td class="has-text-centered">0.769</td>
  <td class="has-text-centered">0.750</td>
  <td class="has-text-centered">0.756</td>
@@ -483,7 +483,7 @@
  <td class="has-text-centered">0.452</td>
  </tr>
  <tr>
- <td>Google Gemini 1.5 Pro</td>
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
  <td class="has-text-centered">0.728</td>
  <td class="has-text-centered">0.705</td>
  <td class="has-text-centered">0.712</td>
@@ -506,7 +506,7 @@
  <td class="has-text-centered">0.393</td>
  </tr>
  <tr>
- <td>OpenAI gpt-4o</td>
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
  <td class="has-text-centered">0.778</td>
  <td class="has-text-centered">0.760</td>
  <td class="has-text-centered">0.766</td>
@@ -529,7 +529,7 @@
  <td class="has-text-centered">0.523</td>
  </tr>
  <tr>
- <td>OpenAI o1-mini</td>
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
  <td class="has-text-centered">0.772</td>
  <td class="has-text-centered">0.755</td>
  <td class="has-text-centered">0.761</td>
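After the tables are patched, a rough spot-check that every model row picked up a tooltip can be done by comparing counts per file (a heuristic sketch: file names are the ones patched above, and the row count also includes header rows containing <tr>):

#!/bin/bash
# Rough spot-check: compare <tr> lines against data-title cells in each table.
for f in causal_analysis_table.html information_retrieval_table.html qa_table.html; do
  rows=$(grep -c '<tr>' "$f")
  tips=$(grep -c 'data-title=' "$f")
  echo "$f: $rows <tr> lines, $tips data-title cells"
done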
qa_table.html CHANGED
@@ -25,139 +25,139 @@
25
  </thead>
26
  <tbody>
27
  <tr>
28
- <td>Llama 3 70B Instruct</td>
29
  <td class="has-text-centered">0.809</td>
30
  <td class="has-text-centered">0.709</td>
31
  <td class="has-text-centered">0.772</td>
32
  </tr>
33
  <tr>
34
- <td>Llama 3 8B Instruct</td>
35
  <td class="has-text-centered">0.767</td>
36
  <td class="has-text-centered">0.268</td>
37
  <td class="has-text-centered">0.706</td>
38
  </tr>
39
  <tr>
40
- <td>DBRX Instruct</td>
41
  <td class="has-text-centered">0.738</td>
42
  <td class="has-text-centered">0.252</td>
43
  <td class="has-text-centered">0.633</td>
44
  </tr>
45
  <tr>
46
- <td>DeepSeek LLM (67B)</td>
47
  <td class="has-text-centered">0.742</td>
48
  <td class="has-text-centered">0.174</td>
49
  <td class="has-text-centered">0.355</td>
50
  </tr>
51
  <tr>
52
- <td>Gemma 2 27B</td>
53
  <td class="has-text-centered">0.768</td>
54
  <td class="has-text-centered">0.268</td>
55
  <td class="has-text-centered">0.734</td>
56
  </tr>
57
  <tr>
58
- <td>Gemma 2 9B</td>
59
  <td class="has-text-centered">0.779</td>
60
  <td class="has-text-centered">0.292</td>
61
  <td class="has-text-centered">0.750</td>
62
  </tr>
63
  <tr>
64
- <td>Mistral (7B) Instruct v0.3</td>
65
  <td class="has-text-centered">0.655</td>
66
  <td class="has-text-centered">0.199</td>
67
  <td class="has-text-centered">0.553</td>
68
  </tr>
69
  <tr>
70
- <td>Mixtral-8x22B Instruct</td>
71
  <td class="has-text-centered">0.766</td>
72
  <td class="has-text-centered">0.285</td>
73
  <td class="has-text-centered">0.666</td>
74
  </tr>
75
  <tr>
76
- <td>Mixtral-8x7B Instruct</td>
77
  <td class="has-text-centered">0.611</td>
78
  <td class="has-text-centered">0.315</td>
79
  <td class="has-text-centered">0.501</td>
80
  </tr>
81
  <tr>
82
- <td>Qwen 2 Instruct (72B)</td>
83
  <td class="has-text-centered">0.819</td>
84
  <td class="has-text-centered">0.269</td>
85
  <td class="has-text-centered">0.715</td>
86
  </tr>
87
  <tr>
88
- <td>WizardLM-2 8x22B</td>
89
  <td class="has-text-centered">0.796</td>
90
  <td class="has-text-centered">0.247</td>
91
  <td class="has-text-centered">0.725</td>
92
  </tr>
93
  <tr>
94
- <td>DeepSeek-V3</td>
95
  <td class="has-text-centered performance-medium">0.840</td>
96
  <td class="has-text-centered">0.261</td>
97
  <td class="has-text-centered performance-low">0.779</td>
98
  </tr>
99
  <tr>
100
- <td>DeepSeek R1</td>
101
  <td class="has-text-centered performance-low">0.836</td>
102
  <td class="has-text-centered performance-best">0.853</td>
103
  <td class="has-text-centered performance-best">0.858</td>
104
  </tr>
105
  <tr>
106
- <td>QwQ-32B-Preview</td>
107
  <td class="has-text-centered">0.793</td>
108
  <td class="has-text-centered">0.282</td>
109
  <td class="has-text-centered performance-medium">0.796</td>
110
  </tr>
111
  <tr>
112
- <td>Jamba 1.5 Mini</td>
113
  <td class="has-text-centered">0.666</td>
114
  <td class="has-text-centered">0.218</td>
115
  <td class="has-text-centered">0.586</td>
116
  </tr>
117
  <tr>
118
- <td>Jamba 1.5 Large</td>
119
  <td class="has-text-centered">0.790</td>
120
  <td class="has-text-centered">0.225</td>
121
  <td class="has-text-centered">0.660</td>
122
  </tr>
123
  <tr>
124
- <td>Claude 3.5 Sonnet</td>
125
  <td class="has-text-centered performance-best">0.844</td>
126
  <td class="has-text-centered">0.402</td>
127
  <td class="has-text-centered">0.700</td>
128
  </tr>
129
  <tr>
130
- <td>Claude 3 Haiku</td>
131
  <td class="has-text-centered">0.803</td>
132
  <td class="has-text-centered">0.421</td>
133
  <td class="has-text-centered">0.733</td>
134
  </tr>
135
  <tr>
136
- <td>Cohere Command R 7B</td>
137
  <td class="has-text-centered">0.709</td>
138
  <td class="has-text-centered">0.212</td>
139
  <td class="has-text-centered">0.716</td>
140
  </tr>
141
  <tr>
142
- <td>Cohere Command R +</td>
143
  <td class="has-text-centered">0.776</td>
144
  <td class="has-text-centered">0.259</td>
145
  <td class="has-text-centered">0.698</td>
146
  </tr>
147
  <tr>
148
- <td>Google Gemini 1.5 Pro</td>
149
  <td class="has-text-centered">0.829</td>
150
  <td class="has-text-centered">0.280</td>
151
  <td class="has-text-centered">0.763</td>
152
  </tr>
153
  <tr>
154
- <td>OpenAI gpt-4o</td>
155
  <td class="has-text-centered performance-low">0.836</td>
156
  <td class="has-text-centered performance-low">0.749</td>
157
  <td class="has-text-centered">0.754</td>
158
  </tr>
159
  <tr>
160
- <td>OpenAI o1-mini</td>
161
  <td class="has-text-centered">0.799</td>
162
  <td class="has-text-centered performance-medium">0.840</td>
163
  <td class="has-text-centered">0.698</td>
 
25
  </thead>
26
  <tbody>
27
  <tr>
28
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
29
  <td class="has-text-centered">0.809</td>
30
  <td class="has-text-centered">0.709</td>
31
  <td class="has-text-centered">0.772</td>
32
  </tr>
33
  <tr>
34
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
35
  <td class="has-text-centered">0.767</td>
36
  <td class="has-text-centered">0.268</td>
37
  <td class="has-text-centered">0.706</td>
38
  </tr>
39
  <tr>
40
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
41
  <td class="has-text-centered">0.738</td>
42
  <td class="has-text-centered">0.252</td>
43
  <td class="has-text-centered">0.633</td>
44
  </tr>
45
  <tr>
46
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
47
  <td class="has-text-centered">0.742</td>
48
  <td class="has-text-centered">0.174</td>
49
  <td class="has-text-centered">0.355</td>
50
  </tr>
51
  <tr>
52
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
53
  <td class="has-text-centered">0.768</td>
54
  <td class="has-text-centered">0.268</td>
55
  <td class="has-text-centered">0.734</td>
56
  </tr>
57
  <tr>
58
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
59
  <td class="has-text-centered">0.779</td>
60
  <td class="has-text-centered">0.292</td>
61
  <td class="has-text-centered">0.750</td>
62
  </tr>
63
  <tr>
64
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
65
  <td class="has-text-centered">0.655</td>
66
  <td class="has-text-centered">0.199</td>
67
  <td class="has-text-centered">0.553</td>
68
  </tr>
69
  <tr>
70
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
71
  <td class="has-text-centered">0.766</td>
72
  <td class="has-text-centered">0.285</td>
73
  <td class="has-text-centered">0.666</td>
74
  </tr>
75
  <tr>
76
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
77
  <td class="has-text-centered">0.611</td>
78
  <td class="has-text-centered">0.315</td>
79
  <td class="has-text-centered">0.501</td>
80
  </tr>
81
  <tr>
82
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
83
  <td class="has-text-centered">0.819</td>
84
  <td class="has-text-centered">0.269</td>
85
  <td class="has-text-centered">0.715</td>
86
  </tr>
87
  <tr>
88
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
89
  <td class="has-text-centered">0.796</td>
90
  <td class="has-text-centered">0.247</td>
91
  <td class="has-text-centered">0.725</td>
92
  </tr>
93
  <tr>
94
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
95
  <td class="has-text-centered performance-medium">0.840</td>
96
  <td class="has-text-centered">0.261</td>
97
  <td class="has-text-centered performance-low">0.779</td>
98
  </tr>
99
  <tr>
100
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
101
  <td class="has-text-centered performance-low">0.836</td>
102
  <td class="has-text-centered performance-best">0.853</td>
103
  <td class="has-text-centered performance-best">0.858</td>
104
  </tr>
105
  <tr>
106
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
107
  <td class="has-text-centered">0.793</td>
108
  <td class="has-text-centered">0.282</td>
109
  <td class="has-text-centered performance-medium">0.796</td>
110
  </tr>
111
  <tr>
112
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
113
  <td class="has-text-centered">0.666</td>
114
  <td class="has-text-centered">0.218</td>
115
  <td class="has-text-centered">0.586</td>
116
  </tr>
117
  <tr>
118
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
119
  <td class="has-text-centered">0.790</td>
120
  <td class="has-text-centered">0.225</td>
121
  <td class="has-text-centered">0.660</td>
122
  </tr>
123
  <tr>
124
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
125
  <td class="has-text-centered performance-best">0.844</td>
126
  <td class="has-text-centered">0.402</td>
127
  <td class="has-text-centered">0.700</td>
128
  </tr>
129
  <tr>
130
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
131
  <td class="has-text-centered">0.803</td>
132
  <td class="has-text-centered">0.421</td>
133
  <td class="has-text-centered">0.733</td>
134
  </tr>
135
  <tr>
136
+ <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
137
  <td class="has-text-centered">0.709</td>
138
  <td class="has-text-centered">0.212</td>
139
  <td class="has-text-centered">0.716</td>
140
  </tr>
141
  <tr>
142
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
143
  <td class="has-text-centered">0.776</td>
144
  <td class="has-text-centered">0.259</td>
145
  <td class="has-text-centered">0.698</td>
146
  </tr>
147
  <tr>
148
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
149
  <td class="has-text-centered">0.829</td>
150
  <td class="has-text-centered">0.280</td>
151
  <td class="has-text-centered">0.763</td>
152
  </tr>
153
  <tr>
154
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
155
  <td class="has-text-centered performance-low">0.836</td>
156
  <td class="has-text-centered performance-low">0.749</td>
157
  <td class="has-text-centered">0.754</td>
158
  </tr>
159
  <tr>
160
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
161
  <td class="has-text-centered">0.799</td>
162
  <td class="has-text-centered performance-medium">0.840</td>
163
  <td class="has-text-centered">0.698</td>
results.html CHANGED
@@ -3135,5 +3135,6 @@
3135
  <script src="static/js/tooltips.js"></script>
3136
  <script src="static/js/fixed-tooltips.js"></script>
3137
  <script src="static/js/tooltip-fix.js"></script>
 
3138
  </body>
3139
  </html>
 
3135
  <script src="static/js/tooltips.js"></script>
3136
  <script src="static/js/fixed-tooltips.js"></script>
3137
  <script src="static/js/tooltip-fix.js"></script>
3138
+ <script src="static/js/model-tooltips.js"></script>
3139
  </body>
3140
  </html>
sentiment_analysis_table.html CHANGED
@@ -35,7 +35,7 @@
35
  </thead>
36
  <tbody>
37
  <tr>
38
- <td>Llama 3 70B Instruct</td>
39
  <td class="has-text-centered">0.123</td>
40
  <td class="has-text-centered">0.290</td>
41
  <td class="has-text-centered">0.272</td>
@@ -49,7 +49,7 @@
49
  <td class="has-text-centered">0.573</td>
50
  </tr>
51
  <tr>
52
- <td>Llama 3 8B Instruct</td>
53
  <td class="has-text-centered">0.161</td>
54
  <td class="has-text-centered">0.344</td>
55
  <td class="has-text-centered">0.045</td>
@@ -63,7 +63,7 @@
63
  <td class="has-text-centered">0.625</td>
64
  </tr>
65
  <tr>
66
- <td>DBRX Instruct</td>
67
  <td class="has-text-centered">0.160</td>
68
  <td class="has-text-centered">0.321</td>
69
  <td class="has-text-centered">0.052</td>
@@ -77,7 +77,7 @@
77
  <td class="has-text-centered">0.541</td>
78
  </tr>
79
  <tr>
80
- <td>DeepSeek LLM (67B)</td>
81
  <td class="has-text-centered">0.118</td>
82
  <td class="has-text-centered">0.278</td>
83
  <td class="has-text-centered">0.302</td>
@@ -91,7 +91,7 @@
91
  <td class="has-text-centered">0.544</td>
92
  </tr>
93
  <tr>
94
- <td>Gemma 2 27B</td>
95
  <td class="has-text-centered performance-best">0.100</td>
96
  <td class="has-text-centered performance-best">0.266</td>
97
  <td class="has-text-centered">0.406</td>
@@ -105,7 +105,7 @@
105
  <td class="has-text-centered">0.524</td>
106
  </tr>
107
  <tr>
108
- <td>Gemma 2 9B</td>
109
  <td class="has-text-centered">0.189</td>
110
  <td class="has-text-centered">0.352</td>
111
  <td class="has-text-centered">-0.120</td>
@@ -119,7 +119,7 @@
119
  <td class="has-text-centered">0.499</td>
120
  </tr>
121
  <tr>
122
- <td>Mistral (7B) Instruct v0.3</td>
123
  <td class="has-text-centered">0.135</td>
124
  <td class="has-text-centered">0.278</td>
125
  <td class="has-text-centered">0.200</td>
@@ -133,7 +133,7 @@
133
  <td class="has-text-centered">0.542</td>
134
  </tr>
135
  <tr>
136
- <td>Mixtral-8x22B Instruct</td>
137
  <td class="has-text-centered">0.221</td>
138
  <td class="has-text-centered">0.364</td>
139
  <td class="has-text-centered">-0.310</td>
@@ -147,7 +147,7 @@
147
  <td class="has-text-centered">0.538</td>
148
  </tr>
149
  <tr>
150
- <td>Mixtral-8x7B Instruct</td>
151
  <td class="has-text-centered">0.208</td>
152
  <td class="has-text-centered">0.307</td>
153
  <td class="has-text-centered">-0.229</td>
@@ -161,7 +161,7 @@
161
  <td class="has-text-centered">0.518</td>
162
  </tr>
163
  <tr>
164
- <td>Qwen 2 Instruct (72B)</td>
165
  <td class="has-text-centered">0.205</td>
166
  <td class="has-text-centered">0.409</td>
167
  <td class="has-text-centered">-0.212</td>
@@ -175,7 +175,7 @@
175
  <td class="has-text-centered">0.601</td>
176
  </tr>
177
  <tr>
178
- <td>WizardLM-2 8x22B</td>
179
  <td class="has-text-centered">0.129</td>
180
  <td class="has-text-centered">0.283</td>
181
  <td class="has-text-centered">0.239</td>
@@ -189,7 +189,7 @@
189
  <td class="has-text-centered">0.570</td>
190
  </tr>
191
  <tr>
192
- <td>DeepSeek-V3</td>
193
  <td class="has-text-centered">0.150</td>
194
  <td class="has-text-centered">0.311</td>
195
  <td class="has-text-centered">0.111</td>
@@ -203,7 +203,7 @@
203
  <td class="has-text-centered">0.572</td>
204
  </tr>
205
  <tr>
206
- <td>DeepSeek R1</td>
207
  <td class="has-text-centered performance-low">0.110</td>
208
  <td class="has-text-centered">0.289</td>
209
  <td class="has-text-centered">0.348</td>
@@ -217,7 +217,7 @@
217
  <td class="has-text-centered">0.489</td>
218
  </tr>
219
  <tr>
220
- <td>QwQ-32B-Preview</td>
221
  <td class="has-text-centered">0.141</td>
222
  <td class="has-text-centered">0.290</td>
223
  <td class="has-text-centered">0.165</td>
@@ -231,7 +231,7 @@
231
  <td class="has-text-centered">0.534</td>
232
  </tr>
233
  <tr>
234
- <td>Jamba 1.5 Mini</td>
235
  <td class="has-text-centered performance-low">0.119</td>
236
  <td class="has-text-centered">0.282</td>
237
  <td class="has-text-centered">0.293</td>
@@ -245,7 +245,7 @@
245
  <td class="has-text-centered">0.525</td>
246
  </tr>
247
  <tr>
248
- <td>Jamba 1.5 Large</td>
249
  <td class="has-text-centered">0.183</td>
250
  <td class="has-text-centered">0.363</td>
251
  <td class="has-text-centered">-0.085</td>
@@ -259,7 +259,7 @@
259
  <td class="has-text-centered">0.573</td>
260
  </tr>
261
  <tr>
262
- <td>Claude 3.5 Sonnet</td>
263
  <td class="has-text-centered performance-low">0.101</td>
264
  <td class="has-text-centered performance-low">0.268</td>
265
  <td class="has-text-centered performance-best">0.402</td>
@@ -273,7 +273,7 @@
273
  <td class="has-text-centered performance-medium">0.585</td>
274
  </tr>
275
  <tr>
276
- <td>Claude 3 Haiku</td>
277
  <td class="has-text-centered">0.167</td>
278
  <td class="has-text-centered">0.349</td>
279
  <td class="has-text-centered">0.008</td>
@@ -287,7 +287,7 @@
287
  <td class="has-text-centered">0.538</td>
288
  </tr>
289
  <tr>
290
- <td>Cohere Command R 7B</td>
291
  <td class="has-text-centered">0.164</td>
292
  <td class="has-text-centered">0.319</td>
293
  <td class="has-text-centered">0.028</td>
@@ -301,7 +301,7 @@
301
  <td class="has-text-centered">0.547</td>
302
  </tr>
303
  <tr>
304
- <td>Cohere Command R +</td>
305
  <td class="has-text-centered performance-medium">0.106</td>
306
  <td class="has-text-centered">0.274</td>
307
  <td class="has-text-centered performance-medium">0.373</td>
@@ -315,7 +315,7 @@
315
  <td class="has-text-centered">0.547</td>
316
  </tr>
317
  <tr>
318
- <td>Google Gemini 1.5 Pro</td>
319
  <td class="has-text-centered">0.144</td>
320
  <td class="has-text-centered">0.329</td>
321
  <td class="has-text-centered">0.149</td>
@@ -329,7 +329,7 @@
329
  <td class="has-text-centered performance-best">0.587</td>
330
  </tr>
331
  <tr>
332
- <td>OpenAI gpt-4o</td>
333
  <td class="has-text-centered">0.184</td>
334
  <td class="has-text-centered">0.317</td>
335
  <td class="has-text-centered">-0.089</td>
@@ -343,7 +343,7 @@
343
  <td class="has-text-centered">0.515</td>
344
  </tr>
345
  <tr>
346
- <td>OpenAI o1-mini</td>
347
  <td class="has-text-centered performance-medium">0.120</td>
348
  <td class="has-text-centered">0.295</td>
349
  <td class="has-text-centered">0.289</td>
 
35
  </thead>
36
  <tbody>
37
  <tr>
38
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
39
  <td class="has-text-centered">0.123</td>
40
  <td class="has-text-centered">0.290</td>
41
  <td class="has-text-centered">0.272</td>
 
49
  <td class="has-text-centered">0.573</td>
50
  </tr>
51
  <tr>
52
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
53
  <td class="has-text-centered">0.161</td>
54
  <td class="has-text-centered">0.344</td>
55
  <td class="has-text-centered">0.045</td>
 
63
  <td class="has-text-centered">0.625</td>
64
  </tr>
65
  <tr>
66
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
67
  <td class="has-text-centered">0.160</td>
68
  <td class="has-text-centered">0.321</td>
69
  <td class="has-text-centered">0.052</td>
 
77
  <td class="has-text-centered">0.541</td>
78
  </tr>
79
  <tr>
80
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
81
  <td class="has-text-centered">0.118</td>
82
  <td class="has-text-centered">0.278</td>
83
  <td class="has-text-centered">0.302</td>
 
91
  <td class="has-text-centered">0.544</td>
92
  </tr>
93
  <tr>
94
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
95
  <td class="has-text-centered performance-best">0.100</td>
96
  <td class="has-text-centered performance-best">0.266</td>
97
  <td class="has-text-centered">0.406</td>
 
105
  <td class="has-text-centered">0.524</td>
106
  </tr>
107
  <tr>
108
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
109
  <td class="has-text-centered">0.189</td>
110
  <td class="has-text-centered">0.352</td>
111
  <td class="has-text-centered">-0.120</td>
 
119
  <td class="has-text-centered">0.499</td>
120
  </tr>
121
  <tr>
122
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
123
  <td class="has-text-centered">0.135</td>
124
  <td class="has-text-centered">0.278</td>
125
  <td class="has-text-centered">0.200</td>
 
133
  <td class="has-text-centered">0.542</td>
134
  </tr>
135
  <tr>
136
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
137
  <td class="has-text-centered">0.221</td>
138
  <td class="has-text-centered">0.364</td>
139
  <td class="has-text-centered">-0.310</td>
 
147
  <td class="has-text-centered">0.538</td>
148
  </tr>
149
  <tr>
150
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
151
  <td class="has-text-centered">0.208</td>
152
  <td class="has-text-centered">0.307</td>
153
  <td class="has-text-centered">-0.229</td>
 
161
  <td class="has-text-centered">0.518</td>
162
  </tr>
163
  <tr>
164
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
165
  <td class="has-text-centered">0.205</td>
166
  <td class="has-text-centered">0.409</td>
167
  <td class="has-text-centered">-0.212</td>
 
175
  <td class="has-text-centered">0.601</td>
176
  </tr>
177
  <tr>
178
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
179
  <td class="has-text-centered">0.129</td>
180
  <td class="has-text-centered">0.283</td>
181
  <td class="has-text-centered">0.239</td>
 
189
  <td class="has-text-centered">0.570</td>
190
  </tr>
191
  <tr>
192
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
193
  <td class="has-text-centered">0.150</td>
194
  <td class="has-text-centered">0.311</td>
195
  <td class="has-text-centered">0.111</td>
 
203
  <td class="has-text-centered">0.572</td>
204
  </tr>
205
  <tr>
206
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
207
  <td class="has-text-centered performance-low">0.110</td>
208
  <td class="has-text-centered">0.289</td>
209
  <td class="has-text-centered">0.348</td>
 
217
  <td class="has-text-centered">0.489</td>
218
  </tr>
219
  <tr>
220
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
221
  <td class="has-text-centered">0.141</td>
222
  <td class="has-text-centered">0.290</td>
223
  <td class="has-text-centered">0.165</td>
 
231
  <td class="has-text-centered">0.534</td>
232
  </tr>
233
  <tr>
234
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
235
  <td class="has-text-centered performance-low">0.119</td>
236
  <td class="has-text-centered">0.282</td>
237
  <td class="has-text-centered">0.293</td>
 
245
  <td class="has-text-centered">0.525</td>
246
  </tr>
247
  <tr>
248
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
249
  <td class="has-text-centered">0.183</td>
250
  <td class="has-text-centered">0.363</td>
251
  <td class="has-text-centered">-0.085</td>
 
259
  <td class="has-text-centered">0.573</td>
260
  </tr>
261
  <tr>
262
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
263
  <td class="has-text-centered performance-low">0.101</td>
264
  <td class="has-text-centered performance-low">0.268</td>
265
  <td class="has-text-centered performance-best">0.402</td>
 
273
  <td class="has-text-centered performance-medium">0.585</td>
274
  </tr>
275
  <tr>
276
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
277
  <td class="has-text-centered">0.167</td>
278
  <td class="has-text-centered">0.349</td>
279
  <td class="has-text-centered">0.008</td>
 
287
  <td class="has-text-centered">0.538</td>
288
  </tr>
289
  <tr>
290
+ <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
291
  <td class="has-text-centered">0.164</td>
292
  <td class="has-text-centered">0.319</td>
293
  <td class="has-text-centered">0.028</td>
 
301
  <td class="has-text-centered">0.547</td>
302
  </tr>
303
  <tr>
304
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
305
  <td class="has-text-centered performance-medium">0.106</td>
306
  <td class="has-text-centered">0.274</td>
307
  <td class="has-text-centered performance-medium">0.373</td>
 
315
  <td class="has-text-centered">0.547</td>
316
  </tr>
317
  <tr>
318
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
319
  <td class="has-text-centered">0.144</td>
320
  <td class="has-text-centered">0.329</td>
321
  <td class="has-text-centered">0.149</td>
 
329
  <td class="has-text-centered performance-best">0.587</td>
330
  </tr>
331
  <tr>
332
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
333
  <td class="has-text-centered">0.184</td>
334
  <td class="has-text-centered">0.317</td>
335
  <td class="has-text-centered">-0.089</td>
 
343
  <td class="has-text-centered">0.515</td>
344
  </tr>
345
  <tr>
346
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
347
  <td class="has-text-centered performance-medium">0.120</td>
348
  <td class="has-text-centered">0.295</td>
349
  <td class="has-text-centered">0.289</td>
static/js/model-tooltips.js ADDED
@@ -0,0 +1,101 @@
1
+ document.addEventListener('DOMContentLoaded', function() {
2
+ // Fix model tooltips in all tabs
3
+ function fixAllModelTooltips() {
4
+ console.log("Fixing model tooltips in all tabs");
5
+
6
+ // Find all model name cells (first column in all tables)
7
+ const modelCells = document.querySelectorAll('td:first-child');
8
+
9
+ // Process each model cell
10
+ modelCells.forEach(cell => {
11
+ // Skip cells that already have tooltips
12
+ if (cell.classList.contains('tooltip-trigger')) {
13
+ return;
14
+ }
15
+
16
+ // Get the model name
17
+ const modelName = cell.textContent.trim();
18
+
19
+ // Add tooltip-trigger class and position style
20
+ cell.classList.add('tooltip-trigger');
21
+ cell.style.position = 'relative';
22
+
23
+ // Add data-title attribute with the model name
24
+ cell.setAttribute('data-title', modelName);
25
+
26
+ // Add descriptive tooltip based on model
27
+ let tooltipText = "";
28
+
29
+ // Set descriptive tooltip based on model name - exact descriptions from cost analysis tab
30
+ if (modelName.includes("GPT-4o") || modelName.includes("gpt-4o")) {
31
+ tooltipText = "OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.";
32
+ } else if (modelName.includes("o1-mini")) {
33
+ tooltipText = "OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.";
34
+ } else if (modelName.includes("Claude 3.5 Sonnet")) {
35
+ tooltipText = "Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.";
36
+ } else if (modelName.includes("Claude 3 Haiku")) {
37
+ tooltipText = "Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.";
38
+ } else if (modelName.includes("Gemini 1.5")) {
39
+ tooltipText = "Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.";
40
+ } else if (modelName.includes("Command R 7B")) {
41
+ tooltipText = "Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.";
42
+ } else if (modelName.includes("Command R +")) {
43
+ tooltipText = "Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.";
44
+ } else if (modelName.includes("DeepSeek R1")) {
45
+ tooltipText = "DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.";
46
+ } else if (modelName.includes("DeepSeek-V3") || modelName.includes("DeepSeek V3")) {
47
+ tooltipText = "DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.";
48
+ } else if (modelName.includes("DeepSeek LLM")) {
49
+ tooltipText = "DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.";
50
+ } else if (modelName.includes("Llama 3 70B")) {
51
+ tooltipText = "Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.";
52
+ } else if (modelName.includes("Llama 3 8B")) {
53
+ tooltipText = "Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.";
54
+ } else if (modelName.includes("DBRX")) {
55
+ tooltipText = "Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.";
56
+ } else if (modelName.includes("Mixtral-8x22B")) {
57
+ tooltipText = "Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.";
58
+ } else if (modelName.includes("Mixtral-8x7B")) {
59
+ tooltipText = "Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.";
60
+ } else if (modelName.includes("Mistral")) {
61
+ tooltipText = "Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.";
62
+ } else if (modelName.includes("Qwen 2")) {
63
+ tooltipText = "Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.";
64
+ } else if (modelName.includes("WizardLM")) {
65
+ tooltipText = "A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.";
66
+ } else if (modelName.includes("Gemma 2 27B")) {
67
+ tooltipText = "Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.";
68
+ } else if (modelName.includes("Gemma 2 9B")) {
69
+ tooltipText = "Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.";
70
+ } else if (modelName.includes("QwQ-32B")) {
71
+ tooltipText = "Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.";
72
+ } else if (modelName.includes("Jamba 1.5 Mini")) {
73
+ tooltipText = "A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.";
74
+ } else if (modelName.includes("Jamba 1.5 Large")) {
75
+ tooltipText = "An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.";
76
+ } else {
77
+ tooltipText = "A language model evaluated in the FLaME financial benchmark. Assessed across multiple financial NLP tasks including classification, summarization, QA, and more.";
78
+ }
79
+
80
+ // Set the tooltip
81
+ cell.setAttribute('data-tooltip', tooltipText);
82
+ });
83
+
84
+ // After adding attributes, run the tooltip fix
85
+ if (window.fixProblemTooltips) {
86
+ window.fixProblemTooltips();
87
+ }
88
+ }
89
+
90
+ // Run on page load
91
+ setTimeout(fixAllModelTooltips, 500);
92
+
93
+ // Run when tabs are clicked
94
+ const tabs = document.querySelectorAll('.tabs li');
95
+ tabs.forEach(tab => {
96
+ tab.addEventListener('click', () => {
97
+ // Give time for content to be displayed
98
+ setTimeout(fixAllModelTooltips, 200);
99
+ });
100
+ });
101
+ });
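The cells above only carry data-title and data-tooltip attributes; rendering them is left to the previously loaded tooltip scripts (tooltips.js, fixed-tooltips.js, tooltip-fix.js), which this commit does not modify. A minimal sketch of what such a renderer could look like, assuming a single shared floating element repositioned on hover; the class name and positioning below are illustrative, not the actual implementation:

document.addEventListener('DOMContentLoaded', function() {
  // Hypothetical renderer sketch; the real logic lives in static/js/tooltips.js.
  const tip = document.createElement('div');
  tip.className = 'model-tooltip';        // assumed class, styled elsewhere
  tip.style.position = 'absolute';
  tip.style.display = 'none';
  document.body.appendChild(tip);

  document.body.addEventListener('mouseover', function(event) {
    const trigger = event.target.closest('.tooltip-trigger');
    if (!trigger) {
      tip.style.display = 'none';
      return;
    }
    // Compose the tooltip from the attributes set by model-tooltips.js
    const title = trigger.getAttribute('data-title') || '';
    const text = trigger.getAttribute('data-tooltip') || '';
    tip.textContent = title ? title + ': ' + text : text;
    // Place the tooltip just below the hovered cell
    const rect = trigger.getBoundingClientRect();
    tip.style.left = (window.scrollX + rect.left) + 'px';
    tip.style.top = (window.scrollY + rect.bottom + 4) + 'px';
    tip.style.display = 'block';
  });
});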
text_classification_table.html CHANGED
@@ -43,7 +43,7 @@
43
  </thead>
44
  <tbody>
45
  <tr>
46
- <td>Llama 3 70B Instruct</td>
47
  <td class="has-text-centered">0.660</td>
48
  <td class="has-text-centered">0.748</td>
49
  <td class="has-text-centered">0.660</td>
@@ -63,7 +63,7 @@
63
  <td class="has-text-centered">0.811</td>
64
  </tr>
65
  <tr>
66
- <td>Llama 3 8B Instruct</td>
67
  <td class="has-text-centered">0.534</td>
68
  <td class="has-text-centered">0.672</td>
69
  <td class="has-text-centered">0.534</td>
@@ -83,7 +83,7 @@
83
  <td class="has-text-centered">0.763</td>
84
  </tr>
85
  <tr>
86
- <td>DBRX Instruct</td>
87
  <td class="has-text-centered">0.578</td>
88
  <td class="has-text-centered">0.706</td>
89
  <td class="has-text-centered">0.578</td>
@@ -103,7 +103,7 @@
103
  <td class="has-text-centered">0.746</td>
104
  </tr>
105
  <tr>
106
- <td>DeepSeek LLM (67B)</td>
107
  <td class="has-text-centered">0.596</td>
108
  <td class="has-text-centered">0.711</td>
109
  <td class="has-text-centered">0.596</td>
@@ -123,7 +123,7 @@
123
  <td class="has-text-centered">0.778</td>
124
  </tr>
125
  <tr>
126
- <td>Gemma 2 27B</td>
127
  <td class="has-text-centered">0.639</td>
128
  <td class="has-text-centered">0.730</td>
129
  <td class="has-text-centered">0.639</td>
@@ -143,7 +143,7 @@
143
  <td class="has-text-centered">0.808</td>
144
  </tr>
145
  <tr>
146
- <td>Gemma 2 9B</td>
147
  <td class="has-text-centered">0.630</td>
148
  <td class="has-text-centered">0.710</td>
149
  <td class="has-text-centered">0.630</td>
@@ -163,7 +163,7 @@
163
  <td class="has-text-centered performance-best">0.856</td>
164
  </tr>
165
  <tr>
166
- <td>Mistral (7B) Instruct v0.3</td>
167
  <td class="has-text-centered">0.547</td>
168
  <td class="has-text-centered">0.677</td>
169
  <td class="has-text-centered">0.547</td>
@@ -183,7 +183,7 @@
183
  <td class="has-text-centered">0.779</td>
184
  </tr>
185
  <tr>
186
- <td>Mixtral-8x22B Instruct</td>
187
  <td class="has-text-centered">0.622</td>
188
  <td class="has-text-centered">0.718</td>
189
  <td class="has-text-centered">0.622</td>
@@ -203,7 +203,7 @@
203
  <td class="has-text-centered performance-medium">0.835</td>
204
  </tr>
205
  <tr>
206
- <td>Mixtral-8x7B Instruct</td>
207
  <td class="has-text-centered">0.567</td>
208
  <td class="has-text-centered">0.693</td>
209
  <td class="has-text-centered">0.567</td>
@@ -223,7 +223,7 @@
223
  <td class="has-text-centered">0.805</td>
224
  </tr>
225
  <tr>
226
- <td>Qwen 2 Instruct (72B)</td>
227
  <td class="has-text-centered">0.644</td>
228
  <td class="has-text-centered">0.730</td>
229
  <td class="has-text-centered">0.644</td>
@@ -243,7 +243,7 @@
243
  <td class="has-text-centered">0.830</td>
244
  </tr>
245
  <tr>
246
- <td>WizardLM-2 8x22B</td>
247
  <td class="has-text-centered">0.664</td>
248
  <td class="has-text-centered">0.737</td>
249
  <td class="has-text-centered">0.664</td>
@@ -263,7 +263,7 @@
263
  <td class="has-text-centered">0.797</td>
264
  </tr>
265
  <tr>
266
- <td>DeepSeek-V3</td>
267
  <td class="has-text-centered performance-strong">0.722</td>
268
  <td class="has-text-centered performance-medium">0.774</td>
269
  <td class="has-text-centered performance-strong">0.722</td>
@@ -283,7 +283,7 @@
283
  <td class="has-text-centered">0.729</td>
284
  </tr>
285
  <tr>
286
- <td>DeepSeek R1</td>
287
  <td class="has-text-centered performance-best">0.772</td>
288
  <td class="has-text-centered performance-strong">0.789</td>
289
  <td class="has-text-centered performance-best">0.772</td>
@@ -303,7 +303,7 @@
303
  <td class="has-text-centered">0.769</td>
304
  </tr>
305
  <tr>
306
- <td>QwQ-32B-Preview</td>
307
  <td class="has-text-centered">0.577</td>
308
  <td class="has-text-centered">0.747</td>
309
  <td class="has-text-centered">0.577</td>
@@ -323,7 +323,7 @@
323
  <td class="has-text-centered">0.744</td>
324
  </tr>
325
  <tr>
326
- <td>Jamba 1.5 Mini</td>
327
  <td class="has-text-centered">0.528</td>
328
  <td class="has-text-centered">0.630</td>
329
  <td class="has-text-centered">0.528</td>
@@ -343,7 +343,7 @@
343
  <td class="has-text-centered">0.682</td>
344
  </tr>
345
  <tr>
346
- <td>Jamba 1.5 Large</td>
347
  <td class="has-text-centered">0.642</td>
348
  <td class="has-text-centered">0.746</td>
349
  <td class="has-text-centered">0.642</td>
@@ -363,7 +363,7 @@
363
  <td class="has-text-centered">0.782</td>
364
  </tr>
365
  <tr>
366
- <td>Claude 3.5 Sonnet</td>
367
  <td class="has-text-centered">0.682</td>
368
  <td class="has-text-centered">0.755</td>
369
  <td class="has-text-centered">0.682</td>
@@ -383,7 +383,7 @@
383
  <td class="has-text-centered">0.827</td>
384
  </tr>
385
  <tr>
386
- <td>Claude 3 Haiku</td>
387
  <td class="has-text-centered">0.639</td>
388
  <td class="has-text-centered">0.735</td>
389
  <td class="has-text-centered">0.639</td>
@@ -403,7 +403,7 @@
403
  <td class="has-text-centered">0.781</td>
404
  </tr>
405
  <tr>
406
- <td>Cohere Command R 7B</td>
407
  <td class="has-text-centered">0.530</td>
408
  <td class="has-text-centered">0.650</td>
409
  <td class="has-text-centered">0.530</td>
@@ -423,7 +423,7 @@
423
  <td class="has-text-centered">0.770</td>
424
  </tr>
425
  <tr>
426
- <td>Cohere Command R +</td>
427
  <td class="has-text-centered">0.660</td>
428
  <td class="has-text-centered">0.747</td>
429
  <td class="has-text-centered">0.660</td>
@@ -443,7 +443,7 @@
443
  <td class="has-text-centered">0.812</td>
444
  </tr>
445
  <tr>
446
- <td>Google Gemini 1.5 Pro</td>
447
  <td class="has-text-centered">0.483</td>
448
  <td class="has-text-centered">0.487</td>
449
  <td class="has-text-centered">0.483</td>
@@ -463,7 +463,7 @@
463
  <td class="has-text-centered performance-strong">0.837</td>
464
  </tr>
465
  <tr>
466
- <td>OpenAI gpt-4o</td>
467
  <td class="has-text-centered performance-medium">0.704</td>
468
  <td class="has-text-centered performance-best">0.792</td>
469
  <td class="has-text-centered performance-medium">0.704</td>
@@ -483,7 +483,7 @@
483
  <td class="has-text-centered">0.824</td>
484
  </tr>
485
  <tr>
486
- <td>OpenAI o1-mini</td>
487
  <td class="has-text-centered">0.681</td>
488
  <td class="has-text-centered">0.760</td>
489
  <td class="has-text-centered">0.681</td>
 
43
  </thead>
44
  <tbody>
45
  <tr>
46
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
47
  <td class="has-text-centered">0.660</td>
48
  <td class="has-text-centered">0.748</td>
49
  <td class="has-text-centered">0.660</td>
 
63
  <td class="has-text-centered">0.811</td>
64
  </tr>
65
  <tr>
66
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
67
  <td class="has-text-centered">0.534</td>
68
  <td class="has-text-centered">0.672</td>
69
  <td class="has-text-centered">0.534</td>
 
83
  <td class="has-text-centered">0.763</td>
84
  </tr>
85
  <tr>
86
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
87
  <td class="has-text-centered">0.578</td>
88
  <td class="has-text-centered">0.706</td>
89
  <td class="has-text-centered">0.578</td>
 
103
  <td class="has-text-centered">0.746</td>
104
  </tr>
105
  <tr>
106
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
107
  <td class="has-text-centered">0.596</td>
108
  <td class="has-text-centered">0.711</td>
109
  <td class="has-text-centered">0.596</td>
 
123
  <td class="has-text-centered">0.778</td>
124
  </tr>
125
  <tr>
126
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
127
  <td class="has-text-centered">0.639</td>
128
  <td class="has-text-centered">0.730</td>
129
  <td class="has-text-centered">0.639</td>
 
143
  <td class="has-text-centered">0.808</td>
144
  </tr>
145
  <tr>
146
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
147
  <td class="has-text-centered">0.630</td>
148
  <td class="has-text-centered">0.710</td>
149
  <td class="has-text-centered">0.630</td>
 
163
  <td class="has-text-centered performance-best">0.856</td>
164
  </tr>
165
  <tr>
166
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
167
  <td class="has-text-centered">0.547</td>
168
  <td class="has-text-centered">0.677</td>
169
  <td class="has-text-centered">0.547</td>
 
183
  <td class="has-text-centered">0.779</td>
184
  </tr>
185
  <tr>
186
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
187
  <td class="has-text-centered">0.622</td>
188
  <td class="has-text-centered">0.718</td>
189
  <td class="has-text-centered">0.622</td>
 
203
  <td class="has-text-centered performance-medium">0.835</td>
204
  </tr>
205
  <tr>
206
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
207
  <td class="has-text-centered">0.567</td>
208
  <td class="has-text-centered">0.693</td>
209
  <td class="has-text-centered">0.567</td>
 
223
  <td class="has-text-centered">0.805</td>
224
  </tr>
225
  <tr>
226
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
227
  <td class="has-text-centered">0.644</td>
228
  <td class="has-text-centered">0.730</td>
229
  <td class="has-text-centered">0.644</td>
 
243
  <td class="has-text-centered">0.830</td>
244
  </tr>
245
  <tr>
246
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
247
  <td class="has-text-centered">0.664</td>
248
  <td class="has-text-centered">0.737</td>
249
  <td class="has-text-centered">0.664</td>
 
263
  <td class="has-text-centered">0.797</td>
264
  </tr>
265
  <tr>
266
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
267
  <td class="has-text-centered performance-strong">0.722</td>
268
  <td class="has-text-centered performance-medium">0.774</td>
269
  <td class="has-text-centered performance-strong">0.722</td>
 
283
  <td class="has-text-centered">0.729</td>
284
  </tr>
285
  <tr>
286
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
287
  <td class="has-text-centered performance-best">0.772</td>
288
  <td class="has-text-centered performance-strong">0.789</td>
289
  <td class="has-text-centered performance-best">0.772</td>
 
303
  <td class="has-text-centered">0.769</td>
304
  </tr>
305
  <tr>
306
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
307
  <td class="has-text-centered">0.577</td>
308
  <td class="has-text-centered">0.747</td>
309
  <td class="has-text-centered">0.577</td>
 
323
  <td class="has-text-centered">0.744</td>
324
  </tr>
325
  <tr>
326
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
327
  <td class="has-text-centered">0.528</td>
328
  <td class="has-text-centered">0.630</td>
329
  <td class="has-text-centered">0.528</td>
 
343
  <td class="has-text-centered">0.682</td>
344
  </tr>
345
  <tr>
346
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
347
  <td class="has-text-centered">0.642</td>
348
  <td class="has-text-centered">0.746</td>
349
  <td class="has-text-centered">0.642</td>
 
363
  <td class="has-text-centered">0.782</td>
364
  </tr>
365
  <tr>
366
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
367
  <td class="has-text-centered">0.682</td>
368
  <td class="has-text-centered">0.755</td>
369
  <td class="has-text-centered">0.682</td>
 
383
  <td class="has-text-centered">0.827</td>
384
  </tr>
385
  <tr>
386
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
387
  <td class="has-text-centered">0.639</td>
388
  <td class="has-text-centered">0.735</td>
389
  <td class="has-text-centered">0.639</td>
 
403
  <td class="has-text-centered">0.781</td>
404
  </tr>
405
  <tr>
406
+ <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
407
  <td class="has-text-centered">0.530</td>
408
  <td class="has-text-centered">0.650</td>
409
  <td class="has-text-centered">0.530</td>
 
423
  <td class="has-text-centered">0.770</td>
424
  </tr>
425
  <tr>
426
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
427
  <td class="has-text-centered">0.660</td>
428
  <td class="has-text-centered">0.747</td>
429
  <td class="has-text-centered">0.660</td>
 
443
  <td class="has-text-centered">0.812</td>
444
  </tr>
445
  <tr>
446
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
447
  <td class="has-text-centered">0.483</td>
448
  <td class="has-text-centered">0.487</td>
449
  <td class="has-text-centered">0.483</td>
 
463
  <td class="has-text-centered performance-strong">0.837</td>
464
  </tr>
465
  <tr>
466
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
467
  <td class="has-text-centered performance-medium">0.704</td>
468
  <td class="has-text-centered performance-best">0.792</td>
469
  <td class="has-text-centered performance-medium">0.704</td>
 
483
  <td class="has-text-centered">0.824</td>
484
  </tr>
485
  <tr>
486
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
487
  <td class="has-text-centered">0.681</td>
488
  <td class="has-text-centered">0.760</td>
489
  <td class="has-text-centered">0.681</td>
text_summarization_table.html CHANGED
@@ -29,7 +29,7 @@
29
  </thead>
30
  <tbody>
31
  <tr>
32
- <td>Llama 3 70B Instruct</td>
33
  <td class="has-text-centered">0.715</td>
34
  <td class="has-text-centered">0.801</td>
35
  <td class="has-text-centered">0.754</td>
@@ -38,7 +38,7 @@
38
  <td class="has-text-centered performance-strong">0.817</td>
39
  </tr>
40
  <tr>
41
- <td>Llama 3 8B Instruct</td>
42
  <td class="has-text-centered">0.724</td>
43
  <td class="has-text-centered">0.796</td>
44
  <td class="has-text-centered">0.757</td>
@@ -47,7 +47,7 @@
47
  <td class="has-text-centered">0.811</td>
48
  </tr>
49
  <tr>
50
- <td>DBRX Instruct</td>
51
  <td class="has-text-centered">0.680</td>
52
  <td class="has-text-centered">0.786</td>
53
  <td class="has-text-centered">0.729</td>
@@ -56,7 +56,7 @@
56
  <td class="has-text-centered">0.806</td>
57
  </tr>
58
  <tr>
59
- <td>DeepSeek LLM (67B)</td>
60
  <td class="has-text-centered">0.692</td>
61
  <td class="has-text-centered">0.678</td>
62
  <td class="has-text-centered">0.681</td>
@@ -65,7 +65,7 @@
65
  <td class="has-text-centered">0.807</td>
66
  </tr>
67
  <tr>
68
- <td>Gemma 2 27B</td>
  <td class="has-text-centered">0.680</td>
  <td class="has-text-centered">0.777</td>
  <td class="has-text-centered">0.723</td>
@@ -74,7 +74,7 @@
  <td class="has-text-centered">0.814</td>
  </tr>
  <tr>
- <td>Gemma 2 9B</td>
  <td class="has-text-centered">0.651</td>
  <td class="has-text-centered">0.531</td>
  <td class="has-text-centered">0.585</td>
@@ -83,7 +83,7 @@
  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
- <td>Mistral (7B) Instruct v0.3</td>
  <td class="has-text-centered">0.702</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.750</td>
@@ -92,7 +92,7 @@
  <td class="has-text-centered">0.811</td>
  </tr>
  <tr>
- <td>Mixtral-8x22B Instruct</td>
  <td class="has-text-centered">0.713</td>
  <td class="has-text-centered performance-best">0.812</td>
  <td class="has-text-centered">0.758</td>
@@ -101,7 +101,7 @@
  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
- <td>Mixtral-8x7B Instruct</td>
  <td class="has-text-centered">0.727</td>
  <td class="has-text-centered">0.773</td>
  <td class="has-text-centered">0.747</td>
@@ -110,7 +110,7 @@
  <td class="has-text-centered">0.810</td>
  </tr>
  <tr>
- <td>Qwen 2 Instruct (72B)</td>
  <td class="has-text-centered">0.709</td>
  <td class="has-text-centered performance-medium">0.804</td>
  <td class="has-text-centered">0.752</td>
@@ -119,7 +119,7 @@
  <td class="has-text-centered">0.811</td>
  </tr>
  <tr>
- <td>WizardLM-2 8x22B</td>
  <td class="has-text-centered">0.677</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.735</td>
@@ -128,7 +128,7 @@
  <td class="has-text-centered">0.808</td>
  </tr>
  <tr>
- <td>DeepSeek-V3</td>
  <td class="has-text-centered">0.703</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.750</td>
@@ -137,7 +137,7 @@
  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
- <td>DeepSeek R1</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered">0.759</td>
@@ -146,7 +146,7 @@
  <td class="has-text-centered">0.804</td>
  </tr>
  <tr>
- <td>QwQ-32B-Preview</td>
  <td class="has-text-centered">0.653</td>
  <td class="has-text-centered">0.751</td>
  <td class="has-text-centered">0.696</td>
@@ -155,7 +155,7 @@
  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Mini</td>
  <td class="has-text-centered">0.692</td>
  <td class="has-text-centered">0.798</td>
  <td class="has-text-centered">0.741</td>
@@ -164,7 +164,7 @@
  <td class="has-text-centered performance-medium">0.816</td>
  </tr>
  <tr>
- <td>Jamba 1.5 Large</td>
  <td class="has-text-centered">0.679</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered">0.734</td>
@@ -173,7 +173,7 @@
  <td class="has-text-centered performance-best">0.818</td>
  </tr>
  <tr>
- <td>Claude 3.5 Sonnet</td>
  <td class="has-text-centered performance-medium">0.737</td>
  <td class="has-text-centered">0.802</td>
  <td class="has-text-centered performance-medium">0.767</td>
@@ -182,7 +182,7 @@
  <td class="has-text-centered">0.813</td>
  </tr>
  <tr>
- <td>Claude 3 Haiku</td>
  <td class="has-text-centered">0.683</td>
  <td class="has-text-centered">0.617</td>
  <td class="has-text-centered">0.646</td>
@@ -191,7 +191,7 @@
  <td class="has-text-centered">0.808</td>
  </tr>
  <tr>
- <td>Cohere Command R 7B</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.781</td>
  <td class="has-text-centered">0.750</td>
@@ -200,7 +200,7 @@
  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
- <td>Cohere Command R +</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.782</td>
  <td class="has-text-centered">0.751</td>
@@ -209,7 +209,7 @@
  <td class="has-text-centered">0.810</td>
  </tr>
  <tr>
- <td>Google Gemini 1.5 Pro</td>
  <td class="has-text-centered performance-best">0.757</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered performance-best">0.777</td>
@@ -218,7 +218,7 @@
  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
- <td>OpenAI gpt-4o</td>
  <td class="has-text-centered performance-strong">0.755</td>
  <td class="has-text-centered">0.793</td>
  <td class="has-text-centered performance-strong">0.773</td>
@@ -227,7 +227,7 @@
  <td class="has-text-centered performance-medium">0.816</td>
  </tr>
  <tr>
- <td>OpenAI o1-mini</td>
  <td class="has-text-centered">0.731</td>
  <td class="has-text-centered">0.801</td>
  <td class="has-text-centered">0.763</td>
 
  </thead>
  <tbody>
  <tr>
+ <td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
  <td class="has-text-centered">0.715</td>
  <td class="has-text-centered">0.801</td>
  <td class="has-text-centered">0.754</td>

  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.796</td>
  <td class="has-text-centered">0.757</td>

  <td class="has-text-centered">0.811</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
  <td class="has-text-centered">0.680</td>
  <td class="has-text-centered">0.786</td>
  <td class="has-text-centered">0.729</td>

  <td class="has-text-centered">0.806</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
  <td class="has-text-centered">0.692</td>
  <td class="has-text-centered">0.678</td>
  <td class="has-text-centered">0.681</td>

  <td class="has-text-centered">0.807</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
  <td class="has-text-centered">0.680</td>
  <td class="has-text-centered">0.777</td>
  <td class="has-text-centered">0.723</td>

  <td class="has-text-centered">0.814</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
  <td class="has-text-centered">0.651</td>
  <td class="has-text-centered">0.531</td>
  <td class="has-text-centered">0.585</td>

  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
  <td class="has-text-centered">0.702</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.750</td>

  <td class="has-text-centered">0.811</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
  <td class="has-text-centered">0.713</td>
  <td class="has-text-centered performance-best">0.812</td>
  <td class="has-text-centered">0.758</td>

  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
  <td class="has-text-centered">0.727</td>
  <td class="has-text-centered">0.773</td>
  <td class="has-text-centered">0.747</td>

  <td class="has-text-centered">0.810</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
  <td class="has-text-centered">0.709</td>
  <td class="has-text-centered performance-medium">0.804</td>
  <td class="has-text-centered">0.752</td>

  <td class="has-text-centered">0.811</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
  <td class="has-text-centered">0.677</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.735</td>

  <td class="has-text-centered">0.808</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
  <td class="has-text-centered">0.703</td>
  <td class="has-text-centered performance-strong">0.806</td>
  <td class="has-text-centered">0.750</td>

  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered">0.759</td>

  <td class="has-text-centered">0.804</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
  <td class="has-text-centered">0.653</td>
  <td class="has-text-centered">0.751</td>
  <td class="has-text-centered">0.696</td>

  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
  <td class="has-text-centered">0.692</td>
  <td class="has-text-centered">0.798</td>
  <td class="has-text-centered">0.741</td>

  <td class="has-text-centered performance-medium">0.816</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
  <td class="has-text-centered">0.679</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered">0.734</td>

  <td class="has-text-centered performance-best">0.818</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
  <td class="has-text-centered performance-medium">0.737</td>
  <td class="has-text-centered">0.802</td>
  <td class="has-text-centered performance-medium">0.767</td>

  <td class="has-text-centered">0.813</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
  <td class="has-text-centered">0.683</td>
  <td class="has-text-centered">0.617</td>
  <td class="has-text-centered">0.646</td>

  <td class="has-text-centered">0.808</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.781</td>
  <td class="has-text-centered">0.750</td>

  <td class="has-text-centered">0.815</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
  <td class="has-text-centered">0.724</td>
  <td class="has-text-centered">0.782</td>
  <td class="has-text-centered">0.751</td>

  <td class="has-text-centered">0.810</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
  <td class="has-text-centered performance-best">0.757</td>
  <td class="has-text-centered">0.800</td>
  <td class="has-text-centered performance-best">0.777</td>

  <td class="has-text-centered performance-strong">0.817</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
  <td class="has-text-centered performance-strong">0.755</td>
  <td class="has-text-centered">0.793</td>
  <td class="has-text-centered performance-strong">0.773</td>

  <td class="has-text-centered performance-medium">0.816</td>
  </tr>
  <tr>
+ <td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
  <td class="has-text-centered">0.731</td>
  <td class="has-text-centered">0.801</td>
  <td class="has-text-centered">0.763</td>
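Note: this diff only adds the tooltip-trigger class and the data-title / data-tooltip attributes to each model cell; the stylesheet or script that actually renders the tooltips is not part of this file and is not shown in the commit excerpt above. As a rough, hedged illustration of how attribute-driven tooltips like these are commonly wired up, a few lines of pure CSS suffice. The selector and attribute names below match the diff; everything else (colors, sizing, placement) is an assumption, not the repository's actual stylesheet:

<style>
  /* Hypothetical sketch, not the commit's real CSS:
     show data-title / data-tooltip as a hover bubble. */
  .tooltip-trigger {
    position: relative;  /* anchor for the absolutely positioned bubble */
    cursor: help;
  }
  .tooltip-trigger:hover::after {
    /* Pull the bubble text straight from the data attributes */
    content: attr(data-title) ": " attr(data-tooltip);
    position: absolute;
    left: 0;
    bottom: 100%;        /* place the bubble above the cell */
    width: 20rem;
    padding: 0.5rem;
    background: #363636;
    color: #fff;
    border-radius: 4px;
    font-size: 0.75rem;
    white-space: normal;
    z-index: 10;
  }
</style>

Because the text lives in data attributes on the cell itself, this approach needs no JavaScript and degrades gracefully: if the CSS never loads, the table still renders the plain model names.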