Spaces:
Running
Running
model tooltips on all pages
Browse files- causal_analysis_table.html +23 -23
- fix_tooltips.sh +199 -1
- information_retrieval_table.html +22 -22
- qa_table.html +23 -23
- results.html +1 -0
- sentiment_analysis_table.html +23 -23
- static/js/model-tooltips.js +101 -0
- text_classification_table.html +23 -23
- text_summarization_table.html +23 -23
causal_analysis_table.html
CHANGED
@@ -31,7 +31,7 @@
|
|
31 |
</thead>
|
32 |
<tbody>
|
33 |
<tr>
|
34 |
-
<td>Llama 3 70B Instruct</td>
|
35 |
<td class="has-text-centered">0.148</td>
|
36 |
<td class="has-text-centered">0.429</td>
|
37 |
<td class="has-text-centered">0.148</td>
|
@@ -42,7 +42,7 @@
|
|
42 |
<td class="has-text-centered">0.198</td>
|
43 |
</tr>
|
44 |
<tr>
|
45 |
-
<td>Llama 3 8B Instruct</td>
|
46 |
<td class="has-text-centered">0.097</td>
|
47 |
<td class="has-text-centered">0.341</td>
|
48 |
<td class="has-text-centered">0.097</td>
|
@@ -53,7 +53,7 @@
|
|
53 |
<td class="has-text-centered performance-strong">0.380</td>
|
54 |
</tr>
|
55 |
<tr>
|
56 |
-
<td>DBRX Instruct</td>
|
57 |
<td class="has-text-centered">0.078</td>
|
58 |
<td class="has-text-centered">0.521</td>
|
59 |
<td class="has-text-centered">0.078</td>
|
@@ -64,7 +64,7 @@
|
|
64 |
<td class="has-text-centered">0.235</td>
|
65 |
</tr>
|
66 |
<tr>
|
67 |
-
<td>DeepSeek LLM (67B)</td>
|
68 |
<td class="has-text-centered">0.026</td>
|
69 |
<td class="has-text-centered">0.214</td>
|
70 |
<td class="has-text-centered">0.026</td>
|
@@ -75,7 +75,7 @@
|
|
75 |
<td class="has-text-centered">0.221</td>
|
76 |
</tr>
|
77 |
<tr>
|
78 |
-
<td>Gemma 2 27B</td>
|
79 |
<td class="has-text-centered">0.115</td>
|
80 |
<td class="has-text-centered">0.510</td>
|
81 |
<td class="has-text-centered">0.115</td>
|
@@ -86,7 +86,7 @@
|
|
86 |
<td class="has-text-centered">0.262</td>
|
87 |
</tr>
|
88 |
<tr>
|
89 |
-
<td>Gemma 2 9B</td>
|
90 |
<td class="has-text-centered">0.115</td>
|
91 |
<td class="has-text-centered">0.394</td>
|
92 |
<td class="has-text-centered">0.115</td>
|
@@ -97,7 +97,7 @@
|
|
97 |
<td class="has-text-centered">0.258</td>
|
98 |
</tr>
|
99 |
<tr>
|
100 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
101 |
<td class="has-text-centered">0.078</td>
|
102 |
<td class="has-text-centered">0.455</td>
|
103 |
<td class="has-text-centered">0.078</td>
|
@@ -108,7 +108,7 @@
|
|
108 |
<td class="has-text-centered">0.258</td>
|
109 |
</tr>
|
110 |
<tr>
|
111 |
-
<td>Mixtral-8x22B Instruct</td>
|
112 |
<td class="has-text-centered">0.131</td>
|
113 |
<td class="has-text-centered">0.486</td>
|
114 |
<td class="has-text-centered">0.131</td>
|
@@ -119,7 +119,7 @@
|
|
119 |
<td class="has-text-centered performance-medium">0.318</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
-
<td>Mixtral-8x7B Instruct</td>
|
123 |
<td class="has-text-centered">0.088</td>
|
124 |
<td class="has-text-centered">0.510</td>
|
125 |
<td class="has-text-centered">0.088</td>
|
@@ -130,7 +130,7 @@
|
|
130 |
<td class="has-text-centered">0.273</td>
|
131 |
</tr>
|
132 |
<tr>
|
133 |
-
<td>Qwen 2 Instruct (72B)</td>
|
134 |
<td class="has-text-centered">0.139</td>
|
135 |
<td class="has-text-centered">0.489</td>
|
136 |
<td class="has-text-centered">0.139</td>
|
@@ -141,7 +141,7 @@
|
|
141 |
<td class="has-text-centered">0.188</td>
|
142 |
</tr>
|
143 |
<tr>
|
144 |
-
<td>WizardLM-2 8x22B</td>
|
145 |
<td class="has-text-centered">0.076</td>
|
146 |
<td class="has-text-centered">0.453</td>
|
147 |
<td class="has-text-centered">0.076</td>
|
@@ -152,7 +152,7 @@
|
|
152 |
<td class="has-text-centered">0.237</td>
|
153 |
</tr>
|
154 |
<tr>
|
155 |
-
<td>DeepSeek-V3</td>
|
156 |
<td class="has-text-centered">0.164</td>
|
157 |
<td class="has-text-centered">0.528</td>
|
158 |
<td class="has-text-centered">0.164</td>
|
@@ -163,7 +163,7 @@
|
|
163 |
<td class="has-text-centered">0.248</td>
|
164 |
</tr>
|
165 |
<tr>
|
166 |
-
<td>DeepSeek R1</td>
|
167 |
<td class="has-text-centered performance-best">0.245</td>
|
168 |
<td class="has-text-centered performance-strong">0.643</td>
|
169 |
<td class="has-text-centered performance-best">0.245</td>
|
@@ -174,7 +174,7 @@
|
|
174 |
<td class="has-text-centered">0.221</td>
|
175 |
</tr>
|
176 |
<tr>
|
177 |
-
<td>QwQ-32B-Preview</td>
|
178 |
<td class="has-text-centered">0.110</td>
|
179 |
<td class="has-text-centered">0.473</td>
|
180 |
<td class="has-text-centered">0.110</td>
|
@@ -185,7 +185,7 @@
|
|
185 |
<td class="has-text-centered performance-best">0.465</td>
|
186 |
</tr>
|
187 |
<tr>
|
188 |
-
<td>Jamba 1.5 Mini</td>
|
189 |
<td class="has-text-centered">0.050</td>
|
190 |
<td class="has-text-centered">0.280</td>
|
191 |
<td class="has-text-centered">0.050</td>
|
@@ -196,7 +196,7 @@
|
|
196 |
<td class="has-text-centered">0.295</td>
|
197 |
</tr>
|
198 |
<tr>
|
199 |
-
<td>Jamba 1.5 Large</td>
|
200 |
<td class="has-text-centered">0.076</td>
|
201 |
<td class="has-text-centered">0.517</td>
|
202 |
<td class="has-text-centered">0.076</td>
|
@@ -207,7 +207,7 @@
|
|
207 |
<td class="has-text-centered">0.200</td>
|
208 |
</tr>
|
209 |
<tr>
|
210 |
-
<td>Claude 3.5 Sonnet</td>
|
211 |
<td class="has-text-centered">0.154</td>
|
212 |
<td class="has-text-centered">0.564</td>
|
213 |
<td class="has-text-centered">0.154</td>
|
@@ -218,7 +218,7 @@
|
|
218 |
<td class="has-text-centered">0.235</td>
|
219 |
</tr>
|
220 |
<tr>
|
221 |
-
<td>Claude 3 Haiku</td>
|
222 |
<td class="has-text-centered">0.082</td>
|
223 |
<td class="has-text-centered">0.388</td>
|
224 |
<td class="has-text-centered">0.082</td>
|
@@ -229,7 +229,7 @@
|
|
229 |
<td class="has-text-centered">0.203</td>
|
230 |
</tr>
|
231 |
<tr>
|
232 |
-
<td>Cohere Command R 7B</td>
|
233 |
<td class="has-text-centered">0.089</td>
|
234 |
<td class="has-text-centered">0.363</td>
|
235 |
<td class="has-text-centered">0.089</td>
|
@@ -240,7 +240,7 @@
|
|
240 |
<td class="has-text-centered">0.275</td>
|
241 |
</tr>
|
242 |
<tr>
|
243 |
-
<td>Cohere Command R +</td>
|
244 |
<td class="has-text-centered">0.090</td>
|
245 |
<td class="has-text-centered">0.453</td>
|
246 |
<td class="has-text-centered">0.090</td>
|
@@ -251,7 +251,7 @@
|
|
251 |
<td class="has-text-centered">0.265</td>
|
252 |
</tr>
|
253 |
<tr>
|
254 |
-
<td>Google Gemini 1.5 Pro</td>
|
255 |
<td class="has-text-centered performance-medium">0.165</td>
|
256 |
<td class="has-text-centered">0.514</td>
|
257 |
<td class="has-text-centered performance-medium">0.165</td>
|
@@ -262,7 +262,7 @@
|
|
262 |
<td class="has-text-centered">0.258</td>
|
263 |
</tr>
|
264 |
<tr>
|
265 |
-
<td>OpenAI gpt-4o</td>
|
266 |
<td class="has-text-centered">0.082</td>
|
267 |
<td class="has-text-centered performance-medium">0.576</td>
|
268 |
<td class="has-text-centered">0.082</td>
|
@@ -273,7 +273,7 @@
|
|
273 |
<td class="has-text-centered">0.235</td>
|
274 |
</tr>
|
275 |
<tr>
|
276 |
-
<td>OpenAI o1-mini</td>
|
277 |
<td class="has-text-centered performance-strong">0.206</td>
|
278 |
<td class="has-text-centered performance-best">0.648</td>
|
279 |
<td class="has-text-centered performance-strong">0.206</td>
|
|
|
31 |
</thead>
|
32 |
<tbody>
|
33 |
<tr>
|
34 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
35 |
<td class="has-text-centered">0.148</td>
|
36 |
<td class="has-text-centered">0.429</td>
|
37 |
<td class="has-text-centered">0.148</td>
|
|
|
42 |
<td class="has-text-centered">0.198</td>
|
43 |
</tr>
|
44 |
<tr>
|
45 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
46 |
<td class="has-text-centered">0.097</td>
|
47 |
<td class="has-text-centered">0.341</td>
|
48 |
<td class="has-text-centered">0.097</td>
|
|
|
53 |
<td class="has-text-centered performance-strong">0.380</td>
|
54 |
</tr>
|
55 |
<tr>
|
56 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
57 |
<td class="has-text-centered">0.078</td>
|
58 |
<td class="has-text-centered">0.521</td>
|
59 |
<td class="has-text-centered">0.078</td>
|
|
|
64 |
<td class="has-text-centered">0.235</td>
|
65 |
</tr>
|
66 |
<tr>
|
67 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
68 |
<td class="has-text-centered">0.026</td>
|
69 |
<td class="has-text-centered">0.214</td>
|
70 |
<td class="has-text-centered">0.026</td>
|
|
|
75 |
<td class="has-text-centered">0.221</td>
|
76 |
</tr>
|
77 |
<tr>
|
78 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
79 |
<td class="has-text-centered">0.115</td>
|
80 |
<td class="has-text-centered">0.510</td>
|
81 |
<td class="has-text-centered">0.115</td>
|
|
|
86 |
<td class="has-text-centered">0.262</td>
|
87 |
</tr>
|
88 |
<tr>
|
89 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
90 |
<td class="has-text-centered">0.115</td>
|
91 |
<td class="has-text-centered">0.394</td>
|
92 |
<td class="has-text-centered">0.115</td>
|
|
|
97 |
<td class="has-text-centered">0.258</td>
|
98 |
</tr>
|
99 |
<tr>
|
100 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
101 |
<td class="has-text-centered">0.078</td>
|
102 |
<td class="has-text-centered">0.455</td>
|
103 |
<td class="has-text-centered">0.078</td>
|
|
|
108 |
<td class="has-text-centered">0.258</td>
|
109 |
</tr>
|
110 |
<tr>
|
111 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
112 |
<td class="has-text-centered">0.131</td>
|
113 |
<td class="has-text-centered">0.486</td>
|
114 |
<td class="has-text-centered">0.131</td>
|
|
|
119 |
<td class="has-text-centered performance-medium">0.318</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
123 |
<td class="has-text-centered">0.088</td>
|
124 |
<td class="has-text-centered">0.510</td>
|
125 |
<td class="has-text-centered">0.088</td>
|
|
|
130 |
<td class="has-text-centered">0.273</td>
|
131 |
</tr>
|
132 |
<tr>
|
133 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
134 |
<td class="has-text-centered">0.139</td>
|
135 |
<td class="has-text-centered">0.489</td>
|
136 |
<td class="has-text-centered">0.139</td>
|
|
|
141 |
<td class="has-text-centered">0.188</td>
|
142 |
</tr>
|
143 |
<tr>
|
144 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
145 |
<td class="has-text-centered">0.076</td>
|
146 |
<td class="has-text-centered">0.453</td>
|
147 |
<td class="has-text-centered">0.076</td>
|
|
|
152 |
<td class="has-text-centered">0.237</td>
|
153 |
</tr>
|
154 |
<tr>
|
155 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
156 |
<td class="has-text-centered">0.164</td>
|
157 |
<td class="has-text-centered">0.528</td>
|
158 |
<td class="has-text-centered">0.164</td>
|
|
|
163 |
<td class="has-text-centered">0.248</td>
|
164 |
</tr>
|
165 |
<tr>
|
166 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
167 |
<td class="has-text-centered performance-best">0.245</td>
|
168 |
<td class="has-text-centered performance-strong">0.643</td>
|
169 |
<td class="has-text-centered performance-best">0.245</td>
|
|
|
174 |
<td class="has-text-centered">0.221</td>
|
175 |
</tr>
|
176 |
<tr>
|
177 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
178 |
<td class="has-text-centered">0.110</td>
|
179 |
<td class="has-text-centered">0.473</td>
|
180 |
<td class="has-text-centered">0.110</td>
|
|
|
185 |
<td class="has-text-centered performance-best">0.465</td>
|
186 |
</tr>
|
187 |
<tr>
|
188 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
189 |
<td class="has-text-centered">0.050</td>
|
190 |
<td class="has-text-centered">0.280</td>
|
191 |
<td class="has-text-centered">0.050</td>
|
|
|
196 |
<td class="has-text-centered">0.295</td>
|
197 |
</tr>
|
198 |
<tr>
|
199 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
200 |
<td class="has-text-centered">0.076</td>
|
201 |
<td class="has-text-centered">0.517</td>
|
202 |
<td class="has-text-centered">0.076</td>
|
|
|
207 |
<td class="has-text-centered">0.200</td>
|
208 |
</tr>
|
209 |
<tr>
|
210 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
211 |
<td class="has-text-centered">0.154</td>
|
212 |
<td class="has-text-centered">0.564</td>
|
213 |
<td class="has-text-centered">0.154</td>
|
|
|
218 |
<td class="has-text-centered">0.235</td>
|
219 |
</tr>
|
220 |
<tr>
|
221 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
222 |
<td class="has-text-centered">0.082</td>
|
223 |
<td class="has-text-centered">0.388</td>
|
224 |
<td class="has-text-centered">0.082</td>
|
|
|
229 |
<td class="has-text-centered">0.203</td>
|
230 |
</tr>
|
231 |
<tr>
|
232 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
233 |
<td class="has-text-centered">0.089</td>
|
234 |
<td class="has-text-centered">0.363</td>
|
235 |
<td class="has-text-centered">0.089</td>
|
|
|
240 |
<td class="has-text-centered">0.275</td>
|
241 |
</tr>
|
242 |
<tr>
|
243 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
244 |
<td class="has-text-centered">0.090</td>
|
245 |
<td class="has-text-centered">0.453</td>
|
246 |
<td class="has-text-centered">0.090</td>
|
|
|
251 |
<td class="has-text-centered">0.265</td>
|
252 |
</tr>
|
253 |
<tr>
|
254 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
255 |
<td class="has-text-centered performance-medium">0.165</td>
|
256 |
<td class="has-text-centered">0.514</td>
|
257 |
<td class="has-text-centered performance-medium">0.165</td>
|
|
|
262 |
<td class="has-text-centered">0.258</td>
|
263 |
</tr>
|
264 |
<tr>
|
265 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
266 |
<td class="has-text-centered">0.082</td>
|
267 |
<td class="has-text-centered performance-medium">0.576</td>
|
268 |
<td class="has-text-centered">0.082</td>
|
|
|
273 |
<td class="has-text-centered">0.235</td>
|
274 |
</tr>
|
275 |
<tr>
|
276 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
277 |
<td class="has-text-centered performance-strong">0.206</td>
|
278 |
<td class="has-text-centered performance-best">0.648</td>
|
279 |
<td class="has-text-centered performance-strong">0.206</td>
|
fix_tooltips.sh
CHANGED
@@ -1,7 +1,47 @@
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
-
# Script to fix tooltips in all HTML files
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
# Fix tooltips in information_retrieval_table.html
|
6 |
sed -i 's/tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain/tooltip-trigger tooltip-right" data-tooltip="A dataset for information retrieval in the financial domain/g' information_retrieval_table.html
|
7 |
|
@@ -17,4 +57,162 @@ sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Manually-annotated datase
|
|
17 |
# Fix tooltips in text_summarization_table.html (in case the tooltip-right class isn't working)
|
18 |
sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/g' text_summarization_table.html
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
echo "Fixed tooltips in all HTML files"
|
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
+
# Script to add model tooltips and fix existing tooltips in all HTML files
|
4 |
|
5 |
+
# Model tooltip definitions - exact descriptions from cost analysis tab
|
6 |
+
declare -A model_tooltips
|
7 |
+
model_tooltips["OpenAI gpt-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
|
8 |
+
model_tooltips["GPT-4o"]="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."
|
9 |
+
model_tooltips["OpenAI o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
|
10 |
+
model_tooltips["o1-mini"]="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."
|
11 |
+
model_tooltips["Claude 3.5 Sonnet"]="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities."
|
12 |
+
model_tooltips["Claude 3 Haiku"]="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks."
|
13 |
+
model_tooltips["Google Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
|
14 |
+
model_tooltips["Gemini 1.5 Pro"]="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."
|
15 |
+
model_tooltips["Cohere Command R 7B"]="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size."
|
16 |
+
model_tooltips["Cohere Command R +"]="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart."
|
17 |
+
model_tooltips["DeepSeek R1"]="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks."
|
18 |
+
model_tooltips["DeepSeek-V3"]="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities."
|
19 |
+
model_tooltips["DeepSeek LLM (67B)"]="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities."
|
20 |
+
model_tooltips["Llama 3 70B Instruct"]="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities."
|
21 |
+
model_tooltips["Llama 3 8B Instruct"]="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities."
|
22 |
+
model_tooltips["DBRX Instruct"]="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities."
|
23 |
+
model_tooltips["Mixtral-8x22B Instruct"]="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance."
|
24 |
+
model_tooltips["Mixtral-8x7B Instruct"]="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities."
|
25 |
+
model_tooltips["Mistral (7B) Instruct v0.3"]="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size."
|
26 |
+
model_tooltips["Qwen 2 Instruct (72B)"]="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities."
|
27 |
+
model_tooltips["WizardLM-2 8x22B"]="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks."
|
28 |
+
model_tooltips["Gemma 2 27B"]="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following."
|
29 |
+
model_tooltips["Gemma 2 9B"]="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size."
|
30 |
+
model_tooltips["QwQ-32B-Preview"]="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks."
|
31 |
+
model_tooltips["Jamba 1.5 Mini"]="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks."
|
32 |
+
model_tooltips["Jamba 1.5 Large"]="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart."
|
33 |
+
|
34 |
+
# Files to process
|
35 |
+
files=(
|
36 |
+
"text_classification_table.html"
|
37 |
+
"sentiment_analysis_table.html"
|
38 |
+
"information_retrieval_table.html"
|
39 |
+
"causal_analysis_table.html"
|
40 |
+
"text_summarization_table.html"
|
41 |
+
"qa_table.html"
|
42 |
+
)
|
43 |
+
|
44 |
+
# Fix existing dataset tooltips
|
45 |
# Fix tooltips in information_retrieval_table.html
|
46 |
sed -i 's/tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain/tooltip-trigger tooltip-right" data-tooltip="A dataset for information retrieval in the financial domain/g' information_retrieval_table.html
|
47 |
|
|
|
57 |
# Fix tooltips in text_summarization_table.html (in case the tooltip-right class isn't working)
|
58 |
sed -i 's/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/tooltip-trigger tooltip-right" data-tooltip="Financial news summarization dataset with 2,000 financial news articles/g' text_summarization_table.html
|
59 |
|
60 |
+
# Add or update model tooltips to each file
|
61 |
+
for file in "${files[@]}"; do
|
62 |
+
echo "Processing $file..."
|
63 |
+
|
64 |
+
# For each model in our list
|
65 |
+
for model in "${!model_tooltips[@]}"; do
|
66 |
+
# Convert model name to a sed-safe string by escaping special characters
|
67 |
+
model_sed_safe=$(echo "$model" | sed 's/[\/&]/\\&/g')
|
68 |
+
tooltip_sed_safe=$(echo "${model_tooltips[$model]}" | sed 's/[\/&]/\\&/g')
|
69 |
+
|
70 |
+
# First, update existing tooltips if they exist
|
71 |
+
sed -i "s/data-title=\"$model_sed_safe\" data-tooltip=\"[^\"]*\"/data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\"/g" "$file"
|
72 |
+
|
73 |
+
# Then, add tooltips to plain model names without tooltips
|
74 |
+
sed -i "s/<td>$model_sed_safe<\/td>/<td class=\"tooltip-trigger tooltip-right\" data-title=\"$model_sed_safe\" data-tooltip=\"$tooltip_sed_safe\">$model_sed_safe<\/td>/g" "$file"
|
75 |
+
done
|
76 |
+
|
77 |
+
# Ensure tooltip script is included at the bottom of the file
|
78 |
+
if ! grep -q "tooltips.js" "$file"; then
|
79 |
+
echo "<script src=\"static/js/tooltips.js\"></script>" >> "$file"
|
80 |
+
fi
|
81 |
+
|
82 |
+
if ! grep -q "fixed-tooltips.js" "$file"; then
|
83 |
+
echo "<script src=\"static/js/fixed-tooltips.js\"></script>" >> "$file"
|
84 |
+
fi
|
85 |
+
|
86 |
+
# Add tooltips.css if not already included
|
87 |
+
if ! grep -q "tooltips.css" "$file"; then
|
88 |
+
sed -i '1i<link rel="stylesheet" href="static/css/tooltips.css">' "$file"
|
89 |
+
fi
|
90 |
+
done
|
91 |
+
|
92 |
+
# Also update results.html to ensure proper tooltip handling
|
93 |
+
echo "Adding tooltip fix to results.html..."
|
94 |
+
|
95 |
+
# Copy the model tooltip fixing code for all tabs to a new JS file
|
96 |
+
cat > static/js/model-tooltips.js << EOF
|
97 |
+
document.addEventListener('DOMContentLoaded', function() {
|
98 |
+
// Fix model tooltips in all tabs
|
99 |
+
function fixAllModelTooltips() {
|
100 |
+
console.log("Fixing model tooltips in all tabs");
|
101 |
+
|
102 |
+
// Find all model name cells (first column in all tables)
|
103 |
+
const modelCells = document.querySelectorAll('td:first-child');
|
104 |
+
|
105 |
+
// Process each model cell
|
106 |
+
modelCells.forEach(cell => {
|
107 |
+
// Skip cells that already have tooltips
|
108 |
+
if (cell.classList.contains('tooltip-trigger')) {
|
109 |
+
return;
|
110 |
+
}
|
111 |
+
|
112 |
+
// Get the model name
|
113 |
+
const modelName = cell.textContent.trim();
|
114 |
+
|
115 |
+
// Add tooltip-trigger class and position style
|
116 |
+
cell.classList.add('tooltip-trigger');
|
117 |
+
cell.style.position = 'relative';
|
118 |
+
|
119 |
+
// Add data-title attribute with the model name
|
120 |
+
cell.setAttribute('data-title', modelName);
|
121 |
+
|
122 |
+
// Add descriptive tooltip based on model
|
123 |
+
let tooltipText = "";
|
124 |
+
|
125 |
+
// Set descriptive tooltip based on model name
|
126 |
+
if (modelName.includes("GPT-4o")) {
|
127 |
+
tooltipText = "OpenAI's advanced proprietary closed-source model. One of the top performers across most tasks.";
|
128 |
+
} else if (modelName.includes("o1-mini")) {
|
129 |
+
tooltipText = "Compact proprietary model from OpenAI. Shows strong performance on causal analysis tasks.";
|
130 |
+
} else if (modelName.includes("Claude 3.5 Sonnet")) {
|
131 |
+
tooltipText = "Anthropic's model optimized for advanced reasoning. Strong performer on text classification and summarization.";
|
132 |
+
} else if (modelName.includes("Claude 3 Haiku")) {
|
133 |
+
tooltipText = "Anthropic's smaller, efficiency-focused model in the Claude series.";
|
134 |
+
} else if (modelName.includes("Gemini 1.5")) {
|
135 |
+
tooltipText = "Google's highly capable proprietary model.";
|
136 |
+
} else if (modelName.includes("Command R 7B")) {
|
137 |
+
tooltipText = "A 7-billion parameter model from Cohere focused on instruction-following.";
|
138 |
+
} else if (modelName.includes("Command R +")) {
|
139 |
+
tooltipText = "An improved version of Cohere's Command R model.";
|
140 |
+
} else if (modelName.includes("DeepSeek R1")) {
|
141 |
+
tooltipText = "Open-weight model from DeepSeek AI with 671B parameters (MoE architecture). One of the top performers in the benchmark.";
|
142 |
+
} else if (modelName.includes("DeepSeek-V3") || modelName.includes("DeepSeek V3")) {
|
143 |
+
tooltipText = "Open-weight model from DeepSeek AI with 685B parameters (MoE architecture).";
|
144 |
+
} else if (modelName.includes("DeepSeek LLM")) {
|
145 |
+
tooltipText = "A 67-billion parameter chat-optimized model from DeepSeek AI.";
|
146 |
+
} else if (modelName.includes("Llama 3 70B")) {
|
147 |
+
tooltipText = "Meta's 70-billion parameter dense model, optimized for instruction-following tasks.";
|
148 |
+
} else if (modelName.includes("Llama 3 8B")) {
|
149 |
+
tooltipText = "Meta's 8-billion parameter efficient model variant.";
|
150 |
+
} else if (modelName.includes("DBRX")) {
|
151 |
+
tooltipText = "Databricks' 132B parameter MoE model.";
|
152 |
+
} else if (modelName.includes("Mixtral-8x22B")) {
|
153 |
+
tooltipText = "141B parameter MoE model from Mistral AI with eight 22-billion parameter sub-models.";
|
154 |
+
} else if (modelName.includes("Mixtral-8x7B")) {
|
155 |
+
tooltipText = "46.7B parameter MoE model from Mistral AI with eight 7-billion parameter sub-models.";
|
156 |
+
} else if (modelName.includes("Mistral")) {
|
157 |
+
tooltipText = "A 7-billion parameter instruction-tuned model from Mistral AI.";
|
158 |
+
} else if (modelName.includes("Qwen 2")) {
|
159 |
+
tooltipText = "Alibaba's 72-billion parameter instruction-following model.";
|
160 |
+
} else if (modelName.includes("WizardLM")) {
|
161 |
+
tooltipText = "A 176B parameter MoE model focused on complex reasoning.";
|
162 |
+
} else if (modelName.includes("Gemma 2 27B")) {
|
163 |
+
tooltipText = "Google's open-weight 27B parameter model.";
|
164 |
+
} else if (modelName.includes("Gemma 2 9B")) {
|
165 |
+
tooltipText = "Google's open-weight 9B parameter efficient model.";
|
166 |
+
} else if (modelName.includes("QwQ-32B")) {
|
167 |
+
tooltipText = "Qwen's experimental MoE model with 32B parameters.";
|
168 |
+
} else if (modelName.includes("Jamba 1.5 Mini")) {
|
169 |
+
tooltipText = "A compact variant of the Jamba model series.";
|
170 |
+
} else if (modelName.includes("Jamba 1.5 Large")) {
|
171 |
+
tooltipText = "An expanded variant of the Jamba model series.";
|
172 |
+
} else {
|
173 |
+
tooltipText = "A large language model from the FLaME evaluation benchmark.";
|
174 |
+
}
|
175 |
+
|
176 |
+
// Set the tooltip
|
177 |
+
cell.setAttribute('data-tooltip', tooltipText);
|
178 |
+
});
|
179 |
+
|
180 |
+
// After adding attributes, run the tooltip fix
|
181 |
+
if (window.fixProblemTooltips) {
|
182 |
+
window.fixProblemTooltips();
|
183 |
+
}
|
184 |
+
}
|
185 |
+
|
186 |
+
// Run on page load
|
187 |
+
setTimeout(fixAllModelTooltips, 500);
|
188 |
+
|
189 |
+
// Run when tabs are clicked
|
190 |
+
const tabs = document.querySelectorAll('.tabs li');
|
191 |
+
tabs.forEach(tab => {
|
192 |
+
tab.addEventListener('click', () => {
|
193 |
+
// Give time for content to be displayed
|
194 |
+
setTimeout(fixAllModelTooltips, 200);
|
195 |
+
});
|
196 |
+
});
|
197 |
+
});
|
198 |
+
EOF
|
199 |
+
|
200 |
+
# Add script inclusion to results.html if not already there
|
201 |
+
if ! grep -q "model-tooltips.js" "results.html"; then
|
202 |
+
# Add the script link before the closing body tag
|
203 |
+
sed -i 's/<\/body>/<script src="static\/js\/model-tooltips.js"><\/script>\n<\/body>/g' "results.html"
|
204 |
+
fi
|
205 |
+
|
206 |
+
# Add tooltip fix to ensure all tabs initialize properly
|
207 |
+
if ! grep -q "window.fixProblemTooltips" "results.html"; then
|
208 |
+
# Add call to fix all tooltips when tabs are clicked
|
209 |
+
sed -i '/document\.addEventListener.*DOMContentLoaded/a \
|
210 |
+
// Fix all tooltips in all tabs\
|
211 |
+
setTimeout(function() {\
|
212 |
+
if (window.fixProblemTooltips) {\
|
213 |
+
window.fixProblemTooltips();\
|
214 |
+
}\
|
215 |
+
}, 500);' "results.html"
|
216 |
+
fi
|
217 |
+
|
218 |
echo "Fixed tooltips in all HTML files"
|
information_retrieval_table.html
CHANGED
@@ -46,7 +46,7 @@
|
|
46 |
</thead>
|
47 |
<tbody>
|
48 |
<tr>
|
49 |
-
<td>Llama 3 70B Instruct</td>
|
50 |
<td class="has-text-centered">0.715</td>
|
51 |
<td class="has-text-centered">0.693</td>
|
52 |
<td class="has-text-centered">0.701</td>
|
@@ -69,7 +69,7 @@
|
|
69 |
<td class="has-text-centered">0.469</td>
|
70 |
</tr>
|
71 |
<tr>
|
72 |
-
<td>Llama 3 8B Instruct</td>
|
73 |
<td class="has-text-centered">0.581</td>
|
74 |
<td class="has-text-centered">0.558</td>
|
75 |
<td class="has-text-centered">0.565</td>
|
@@ -92,7 +92,7 @@
|
|
92 |
<td class="has-text-centered">0.350</td>
|
93 |
</tr>
|
94 |
<tr>
|
95 |
-
<td>DBRX Instruct</td>
|
96 |
<td class="has-text-centered">0.516</td>
|
97 |
<td class="has-text-centered">0.476</td>
|
98 |
<td class="has-text-centered">0.489</td>
|
@@ -115,7 +115,7 @@
|
|
115 |
<td class="has-text-centered">0.006</td>
|
116 |
</tr>
|
117 |
<tr>
|
118 |
-
<td>DeepSeek LLM (67B)</td>
|
119 |
<td class="has-text-centered">0.752</td>
|
120 |
<td class="has-text-centered">0.742</td>
|
121 |
<td class="has-text-centered">0.745</td>
|
@@ -138,7 +138,7 @@
|
|
138 |
<td class="has-text-centered">0.416</td>
|
139 |
</tr>
|
140 |
<tr>
|
141 |
-
<td>Gemma 2 27B</td>
|
142 |
<td class="has-text-centered">0.772</td>
|
143 |
<td class="has-text-centered">0.754</td>
|
144 |
<td class="has-text-centered">0.761</td>
|
@@ -161,7 +161,7 @@
|
|
161 |
<td class="has-text-centered">0.298</td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
-
<td>Gemma 2 9B</td>
|
165 |
<td class="has-text-centered">0.665</td>
|
166 |
<td class="has-text-centered">0.643</td>
|
167 |
<td class="has-text-centered">0.651</td>
|
@@ -184,7 +184,7 @@
|
|
184 |
<td class="has-text-centered">0.367</td>
|
185 |
</tr>
|
186 |
<tr>
|
187 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
188 |
<td class="has-text-centered">0.540</td>
|
189 |
<td class="has-text-centered">0.522</td>
|
190 |
<td class="has-text-centered">0.526</td>
|
@@ -207,7 +207,7 @@
|
|
207 |
<td class="has-text-centered">0.368</td>
|
208 |
</tr>
|
209 |
<tr>
|
210 |
-
<td>Mixtral-8x22B Instruct</td>
|
211 |
<td class="has-text-centered">0.653</td>
|
212 |
<td class="has-text-centered">0.625</td>
|
213 |
<td class="has-text-centered">0.635</td>
|
@@ -230,7 +230,7 @@
|
|
230 |
<td class="has-text-centered">0.435</td>
|
231 |
</tr>
|
232 |
<tr>
|
233 |
-
<td>Mixtral-8x7B Instruct</td>
|
234 |
<td class="has-text-centered">0.613</td>
|
235 |
<td class="has-text-centered">0.591</td>
|
236 |
<td class="has-text-centered">0.598</td>
|
@@ -253,7 +253,7 @@
|
|
253 |
<td class="has-text-centered">0.267</td>
|
254 |
</tr>
|
255 |
<tr>
|
256 |
-
<td>Qwen 2 Instruct (72B)</td>
|
257 |
<td class="has-text-centered">0.766</td>
|
258 |
<td class="has-text-centered">0.742</td>
|
259 |
<td class="has-text-centered">0.748</td>
|
@@ -276,7 +276,7 @@
|
|
276 |
<td class="has-text-centered">0.483</td>
|
277 |
</tr>
|
278 |
<tr>
|
279 |
-
<td>WizardLM-2 8x22B</td>
|
280 |
<td class="has-text-centered">0.755</td>
|
281 |
<td class="has-text-centered">0.741</td>
|
282 |
<td class="has-text-centered">0.744</td>
|
@@ -299,7 +299,7 @@
|
|
299 |
<td class="has-text-centered">0.226</td>
|
300 |
</tr>
|
301 |
<tr>
|
302 |
-
<td>DeepSeek-V3</td>
|
303 |
<td class="has-text-centered performance-medium">0.798</td>
|
304 |
<td class="has-text-centered performance-medium">0.787</td>
|
305 |
<td class="has-text-centered performance-medium">0.790</td>
|
@@ -322,7 +322,7 @@
|
|
322 |
<td class="has-text-centered">0.549</td>
|
323 |
</tr>
|
324 |
<tr>
|
325 |
-
<td>DeepSeek R1</td>
|
326 |
<td class="has-text-centered performance-best">0.813</td>
|
327 |
<td class="has-text-centered performance-best">0.805</td>
|
328 |
<td class="has-text-centered performance-best">0.807</td>
|
@@ -345,7 +345,7 @@
|
|
345 |
<td class="has-text-centered performance-medium">0.587</td>
|
346 |
</tr>
|
347 |
<tr>
|
348 |
-
<td>QwQ-32B-Preview</td>
|
349 |
<td class="has-text-centered">0.695</td>
|
350 |
<td class="has-text-centered">0.681</td>
|
351 |
<td class="has-text-centered">0.685</td>
|
@@ -368,7 +368,7 @@
|
|
368 |
<td class="has-text-centered">0.005</td>
|
369 |
</tr>
|
370 |
<tr>
|
371 |
-
<td>Jamba 1.5 Mini</td>
|
372 |
<td class="has-text-centered">0.564</td>
|
373 |
<td class="has-text-centered">0.556</td>
|
374 |
<td class="has-text-centered">0.552</td>
|
@@ -391,7 +391,7 @@
|
|
391 |
<td class="has-text-centered">0.132</td>
|
392 |
</tr>
|
393 |
<tr>
|
394 |
-
<td>Jamba 1.5 Large</td>
|
395 |
<td class="has-text-centered">0.707</td>
|
396 |
<td class="has-text-centered">0.687</td>
|
397 |
<td class="has-text-centered">0.693</td>
|
@@ -414,7 +414,7 @@
|
|
414 |
<td class="has-text-centered">0.397</td>
|
415 |
</tr>
|
416 |
<tr>
|
417 |
-
<td>Claude 3.5 Sonnet</td>
|
418 |
<td class="has-text-centered performance-strong">0.811</td>
|
419 |
<td class="has-text-centered performance-strong">0.794</td>
|
420 |
<td class="has-text-centered performance-strong">0.799</td>
|
@@ -437,7 +437,7 @@
|
|
437 |
<td class="has-text-centered performance-strong">0.655</td>
|
438 |
</tr>
|
439 |
<tr>
|
440 |
-
<td>Claude 3 Haiku</td>
|
441 |
<td class="has-text-centered">0.732</td>
|
442 |
<td class="has-text-centered">0.700</td>
|
443 |
<td class="has-text-centered">0.711</td>
|
@@ -460,7 +460,7 @@
|
|
460 |
<td class="has-text-centered">0.494</td>
|
461 |
</tr>
|
462 |
<tr>
|
463 |
-
<td>Cohere Command R +</td>
|
464 |
<td class="has-text-centered">0.769</td>
|
465 |
<td class="has-text-centered">0.750</td>
|
466 |
<td class="has-text-centered">0.756</td>
|
@@ -483,7 +483,7 @@
|
|
483 |
<td class="has-text-centered">0.452</td>
|
484 |
</tr>
|
485 |
<tr>
|
486 |
-
<td>Google Gemini 1.5 Pro</td>
|
487 |
<td class="has-text-centered">0.728</td>
|
488 |
<td class="has-text-centered">0.705</td>
|
489 |
<td class="has-text-centered">0.712</td>
|
@@ -506,7 +506,7 @@
|
|
506 |
<td class="has-text-centered">0.393</td>
|
507 |
</tr>
|
508 |
<tr>
|
509 |
-
<td>OpenAI gpt-4o</td>
|
510 |
<td class="has-text-centered">0.778</td>
|
511 |
<td class="has-text-centered">0.760</td>
|
512 |
<td class="has-text-centered">0.766</td>
|
@@ -529,7 +529,7 @@
|
|
529 |
<td class="has-text-centered">0.523</td>
|
530 |
</tr>
|
531 |
<tr>
|
532 |
-
<td>OpenAI o1-mini</td>
|
533 |
<td class="has-text-centered">0.772</td>
|
534 |
<td class="has-text-centered">0.755</td>
|
535 |
<td class="has-text-centered">0.761</td>
|
|
|
46 |
</thead>
|
47 |
<tbody>
|
48 |
<tr>
|
49 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
50 |
<td class="has-text-centered">0.715</td>
|
51 |
<td class="has-text-centered">0.693</td>
|
52 |
<td class="has-text-centered">0.701</td>
|
|
|
69 |
<td class="has-text-centered">0.469</td>
|
70 |
</tr>
|
71 |
<tr>
|
72 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
73 |
<td class="has-text-centered">0.581</td>
|
74 |
<td class="has-text-centered">0.558</td>
|
75 |
<td class="has-text-centered">0.565</td>
|
|
|
92 |
<td class="has-text-centered">0.350</td>
|
93 |
</tr>
|
94 |
<tr>
|
95 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
96 |
<td class="has-text-centered">0.516</td>
|
97 |
<td class="has-text-centered">0.476</td>
|
98 |
<td class="has-text-centered">0.489</td>
|
|
|
115 |
<td class="has-text-centered">0.006</td>
|
116 |
</tr>
|
117 |
<tr>
|
118 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
119 |
<td class="has-text-centered">0.752</td>
|
120 |
<td class="has-text-centered">0.742</td>
|
121 |
<td class="has-text-centered">0.745</td>
|
|
|
138 |
<td class="has-text-centered">0.416</td>
|
139 |
</tr>
|
140 |
<tr>
|
141 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
142 |
<td class="has-text-centered">0.772</td>
|
143 |
<td class="has-text-centered">0.754</td>
|
144 |
<td class="has-text-centered">0.761</td>
|
|
|
161 |
<td class="has-text-centered">0.298</td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
165 |
<td class="has-text-centered">0.665</td>
|
166 |
<td class="has-text-centered">0.643</td>
|
167 |
<td class="has-text-centered">0.651</td>
|
|
|
184 |
<td class="has-text-centered">0.367</td>
|
185 |
</tr>
|
186 |
<tr>
|
187 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
188 |
<td class="has-text-centered">0.540</td>
|
189 |
<td class="has-text-centered">0.522</td>
|
190 |
<td class="has-text-centered">0.526</td>
|
|
|
207 |
<td class="has-text-centered">0.368</td>
|
208 |
</tr>
|
209 |
<tr>
|
210 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
211 |
<td class="has-text-centered">0.653</td>
|
212 |
<td class="has-text-centered">0.625</td>
|
213 |
<td class="has-text-centered">0.635</td>
|
|
|
230 |
<td class="has-text-centered">0.435</td>
|
231 |
</tr>
|
232 |
<tr>
|
233 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
234 |
<td class="has-text-centered">0.613</td>
|
235 |
<td class="has-text-centered">0.591</td>
|
236 |
<td class="has-text-centered">0.598</td>
|
|
|
253 |
<td class="has-text-centered">0.267</td>
|
254 |
</tr>
|
255 |
<tr>
|
256 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
257 |
<td class="has-text-centered">0.766</td>
|
258 |
<td class="has-text-centered">0.742</td>
|
259 |
<td class="has-text-centered">0.748</td>
|
|
|
276 |
<td class="has-text-centered">0.483</td>
|
277 |
</tr>
|
278 |
<tr>
|
279 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
280 |
<td class="has-text-centered">0.755</td>
|
281 |
<td class="has-text-centered">0.741</td>
|
282 |
<td class="has-text-centered">0.744</td>
|
|
|
299 |
<td class="has-text-centered">0.226</td>
|
300 |
</tr>
|
301 |
<tr>
|
302 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
303 |
<td class="has-text-centered performance-medium">0.798</td>
|
304 |
<td class="has-text-centered performance-medium">0.787</td>
|
305 |
<td class="has-text-centered performance-medium">0.790</td>
|
|
|
322 |
<td class="has-text-centered">0.549</td>
|
323 |
</tr>
|
324 |
<tr>
|
325 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
326 |
<td class="has-text-centered performance-best">0.813</td>
|
327 |
<td class="has-text-centered performance-best">0.805</td>
|
328 |
<td class="has-text-centered performance-best">0.807</td>
|
|
|
345 |
<td class="has-text-centered performance-medium">0.587</td>
|
346 |
</tr>
|
347 |
<tr>
|
348 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
349 |
<td class="has-text-centered">0.695</td>
|
350 |
<td class="has-text-centered">0.681</td>
|
351 |
<td class="has-text-centered">0.685</td>
|
|
|
368 |
<td class="has-text-centered">0.005</td>
|
369 |
</tr>
|
370 |
<tr>
|
371 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
372 |
<td class="has-text-centered">0.564</td>
|
373 |
<td class="has-text-centered">0.556</td>
|
374 |
<td class="has-text-centered">0.552</td>
|
|
|
391 |
<td class="has-text-centered">0.132</td>
|
392 |
</tr>
|
393 |
<tr>
|
394 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
395 |
<td class="has-text-centered">0.707</td>
|
396 |
<td class="has-text-centered">0.687</td>
|
397 |
<td class="has-text-centered">0.693</td>
|
|
|
414 |
<td class="has-text-centered">0.397</td>
|
415 |
</tr>
|
416 |
<tr>
|
417 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
418 |
<td class="has-text-centered performance-strong">0.811</td>
|
419 |
<td class="has-text-centered performance-strong">0.794</td>
|
420 |
<td class="has-text-centered performance-strong">0.799</td>
|
|
|
437 |
<td class="has-text-centered performance-strong">0.655</td>
|
438 |
</tr>
|
439 |
<tr>
|
440 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
441 |
<td class="has-text-centered">0.732</td>
|
442 |
<td class="has-text-centered">0.700</td>
|
443 |
<td class="has-text-centered">0.711</td>
|
|
|
460 |
<td class="has-text-centered">0.494</td>
|
461 |
</tr>
|
462 |
<tr>
|
463 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
464 |
<td class="has-text-centered">0.769</td>
|
465 |
<td class="has-text-centered">0.750</td>
|
466 |
<td class="has-text-centered">0.756</td>
|
|
|
483 |
<td class="has-text-centered">0.452</td>
|
484 |
</tr>
|
485 |
<tr>
|
486 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
487 |
<td class="has-text-centered">0.728</td>
|
488 |
<td class="has-text-centered">0.705</td>
|
489 |
<td class="has-text-centered">0.712</td>
|
|
|
506 |
<td class="has-text-centered">0.393</td>
|
507 |
</tr>
|
508 |
<tr>
|
509 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
510 |
<td class="has-text-centered">0.778</td>
|
511 |
<td class="has-text-centered">0.760</td>
|
512 |
<td class="has-text-centered">0.766</td>
|
|
|
529 |
<td class="has-text-centered">0.523</td>
|
530 |
</tr>
|
531 |
<tr>
|
532 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
533 |
<td class="has-text-centered">0.772</td>
|
534 |
<td class="has-text-centered">0.755</td>
|
535 |
<td class="has-text-centered">0.761</td>
|
qa_table.html
CHANGED
@@ -25,139 +25,139 @@
|
|
25 |
</thead>
|
26 |
<tbody>
|
27 |
<tr>
|
28 |
-
<td>Llama 3 70B Instruct</td>
|
29 |
<td class="has-text-centered">0.809</td>
|
30 |
<td class="has-text-centered">0.709</td>
|
31 |
<td class="has-text-centered">0.772</td>
|
32 |
</tr>
|
33 |
<tr>
|
34 |
-
<td>Llama 3 8B Instruct</td>
|
35 |
<td class="has-text-centered">0.767</td>
|
36 |
<td class="has-text-centered">0.268</td>
|
37 |
<td class="has-text-centered">0.706</td>
|
38 |
</tr>
|
39 |
<tr>
|
40 |
-
<td>DBRX Instruct</td>
|
41 |
<td class="has-text-centered">0.738</td>
|
42 |
<td class="has-text-centered">0.252</td>
|
43 |
<td class="has-text-centered">0.633</td>
|
44 |
</tr>
|
45 |
<tr>
|
46 |
-
<td>DeepSeek LLM (67B)</td>
|
47 |
<td class="has-text-centered">0.742</td>
|
48 |
<td class="has-text-centered">0.174</td>
|
49 |
<td class="has-text-centered">0.355</td>
|
50 |
</tr>
|
51 |
<tr>
|
52 |
-
<td>Gemma 2 27B</td>
|
53 |
<td class="has-text-centered">0.768</td>
|
54 |
<td class="has-text-centered">0.268</td>
|
55 |
<td class="has-text-centered">0.734</td>
|
56 |
</tr>
|
57 |
<tr>
|
58 |
-
<td>Gemma 2 9B</td>
|
59 |
<td class="has-text-centered">0.779</td>
|
60 |
<td class="has-text-centered">0.292</td>
|
61 |
<td class="has-text-centered">0.750</td>
|
62 |
</tr>
|
63 |
<tr>
|
64 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
65 |
<td class="has-text-centered">0.655</td>
|
66 |
<td class="has-text-centered">0.199</td>
|
67 |
<td class="has-text-centered">0.553</td>
|
68 |
</tr>
|
69 |
<tr>
|
70 |
-
<td>Mixtral-8x22B Instruct</td>
|
71 |
<td class="has-text-centered">0.766</td>
|
72 |
<td class="has-text-centered">0.285</td>
|
73 |
<td class="has-text-centered">0.666</td>
|
74 |
</tr>
|
75 |
<tr>
|
76 |
-
<td>Mixtral-8x7B Instruct</td>
|
77 |
<td class="has-text-centered">0.611</td>
|
78 |
<td class="has-text-centered">0.315</td>
|
79 |
<td class="has-text-centered">0.501</td>
|
80 |
</tr>
|
81 |
<tr>
|
82 |
-
<td>Qwen 2 Instruct (72B)</td>
|
83 |
<td class="has-text-centered">0.819</td>
|
84 |
<td class="has-text-centered">0.269</td>
|
85 |
<td class="has-text-centered">0.715</td>
|
86 |
</tr>
|
87 |
<tr>
|
88 |
-
<td>WizardLM-2 8x22B</td>
|
89 |
<td class="has-text-centered">0.796</td>
|
90 |
<td class="has-text-centered">0.247</td>
|
91 |
<td class="has-text-centered">0.725</td>
|
92 |
</tr>
|
93 |
<tr>
|
94 |
-
<td>DeepSeek-V3</td>
|
95 |
<td class="has-text-centered performance-medium">0.840</td>
|
96 |
<td class="has-text-centered">0.261</td>
|
97 |
<td class="has-text-centered performance-low">0.779</td>
|
98 |
</tr>
|
99 |
<tr>
|
100 |
-
<td>DeepSeek R1</td>
|
101 |
<td class="has-text-centered performance-low">0.836</td>
|
102 |
<td class="has-text-centered performance-best">0.853</td>
|
103 |
<td class="has-text-centered performance-best">0.858</td>
|
104 |
</tr>
|
105 |
<tr>
|
106 |
-
<td>QwQ-32B-Preview</td>
|
107 |
<td class="has-text-centered">0.793</td>
|
108 |
<td class="has-text-centered">0.282</td>
|
109 |
<td class="has-text-centered performance-medium">0.796</td>
|
110 |
</tr>
|
111 |
<tr>
|
112 |
-
<td>Jamba 1.5 Mini</td>
|
113 |
<td class="has-text-centered">0.666</td>
|
114 |
<td class="has-text-centered">0.218</td>
|
115 |
<td class="has-text-centered">0.586</td>
|
116 |
</tr>
|
117 |
<tr>
|
118 |
-
<td>Jamba 1.5 Large</td>
|
119 |
<td class="has-text-centered">0.790</td>
|
120 |
<td class="has-text-centered">0.225</td>
|
121 |
<td class="has-text-centered">0.660</td>
|
122 |
</tr>
|
123 |
<tr>
|
124 |
-
<td>Claude 3.5 Sonnet</td>
|
125 |
<td class="has-text-centered performance-best">0.844</td>
|
126 |
<td class="has-text-centered">0.402</td>
|
127 |
<td class="has-text-centered">0.700</td>
|
128 |
</tr>
|
129 |
<tr>
|
130 |
-
<td>Claude 3 Haiku</td>
|
131 |
<td class="has-text-centered">0.803</td>
|
132 |
<td class="has-text-centered">0.421</td>
|
133 |
<td class="has-text-centered">0.733</td>
|
134 |
</tr>
|
135 |
<tr>
|
136 |
-
<td>Cohere Command R 7B</td>
|
137 |
<td class="has-text-centered">0.709</td>
|
138 |
<td class="has-text-centered">0.212</td>
|
139 |
<td class="has-text-centered">0.716</td>
|
140 |
</tr>
|
141 |
<tr>
|
142 |
-
<td>Cohere Command R +</td>
|
143 |
<td class="has-text-centered">0.776</td>
|
144 |
<td class="has-text-centered">0.259</td>
|
145 |
<td class="has-text-centered">0.698</td>
|
146 |
</tr>
|
147 |
<tr>
|
148 |
-
<td>Google Gemini 1.5 Pro</td>
|
149 |
<td class="has-text-centered">0.829</td>
|
150 |
<td class="has-text-centered">0.280</td>
|
151 |
<td class="has-text-centered">0.763</td>
|
152 |
</tr>
|
153 |
<tr>
|
154 |
-
<td>OpenAI gpt-4o</td>
|
155 |
<td class="has-text-centered performance-low">0.836</td>
|
156 |
<td class="has-text-centered performance-low">0.749</td>
|
157 |
<td class="has-text-centered">0.754</td>
|
158 |
</tr>
|
159 |
<tr>
|
160 |
-
<td>OpenAI o1-mini</td>
|
161 |
<td class="has-text-centered">0.799</td>
|
162 |
<td class="has-text-centered performance-medium">0.840</td>
|
163 |
<td class="has-text-centered">0.698</td>
|
|
|
25 |
</thead>
|
26 |
<tbody>
|
27 |
<tr>
|
28 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
29 |
<td class="has-text-centered">0.809</td>
|
30 |
<td class="has-text-centered">0.709</td>
|
31 |
<td class="has-text-centered">0.772</td>
|
32 |
</tr>
|
33 |
<tr>
|
34 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
35 |
<td class="has-text-centered">0.767</td>
|
36 |
<td class="has-text-centered">0.268</td>
|
37 |
<td class="has-text-centered">0.706</td>
|
38 |
</tr>
|
39 |
<tr>
|
40 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
41 |
<td class="has-text-centered">0.738</td>
|
42 |
<td class="has-text-centered">0.252</td>
|
43 |
<td class="has-text-centered">0.633</td>
|
44 |
</tr>
|
45 |
<tr>
|
46 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
47 |
<td class="has-text-centered">0.742</td>
|
48 |
<td class="has-text-centered">0.174</td>
|
49 |
<td class="has-text-centered">0.355</td>
|
50 |
</tr>
|
51 |
<tr>
|
52 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
53 |
<td class="has-text-centered">0.768</td>
|
54 |
<td class="has-text-centered">0.268</td>
|
55 |
<td class="has-text-centered">0.734</td>
|
56 |
</tr>
|
57 |
<tr>
|
58 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
59 |
<td class="has-text-centered">0.779</td>
|
60 |
<td class="has-text-centered">0.292</td>
|
61 |
<td class="has-text-centered">0.750</td>
|
62 |
</tr>
|
63 |
<tr>
|
64 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
65 |
<td class="has-text-centered">0.655</td>
|
66 |
<td class="has-text-centered">0.199</td>
|
67 |
<td class="has-text-centered">0.553</td>
|
68 |
</tr>
|
69 |
<tr>
|
70 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
71 |
<td class="has-text-centered">0.766</td>
|
72 |
<td class="has-text-centered">0.285</td>
|
73 |
<td class="has-text-centered">0.666</td>
|
74 |
</tr>
|
75 |
<tr>
|
76 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
77 |
<td class="has-text-centered">0.611</td>
|
78 |
<td class="has-text-centered">0.315</td>
|
79 |
<td class="has-text-centered">0.501</td>
|
80 |
</tr>
|
81 |
<tr>
|
82 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
83 |
<td class="has-text-centered">0.819</td>
|
84 |
<td class="has-text-centered">0.269</td>
|
85 |
<td class="has-text-centered">0.715</td>
|
86 |
</tr>
|
87 |
<tr>
|
88 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
89 |
<td class="has-text-centered">0.796</td>
|
90 |
<td class="has-text-centered">0.247</td>
|
91 |
<td class="has-text-centered">0.725</td>
|
92 |
</tr>
|
93 |
<tr>
|
94 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
95 |
<td class="has-text-centered performance-medium">0.840</td>
|
96 |
<td class="has-text-centered">0.261</td>
|
97 |
<td class="has-text-centered performance-low">0.779</td>
|
98 |
</tr>
|
99 |
<tr>
|
100 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
101 |
<td class="has-text-centered performance-low">0.836</td>
|
102 |
<td class="has-text-centered performance-best">0.853</td>
|
103 |
<td class="has-text-centered performance-best">0.858</td>
|
104 |
</tr>
|
105 |
<tr>
|
106 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
107 |
<td class="has-text-centered">0.793</td>
|
108 |
<td class="has-text-centered">0.282</td>
|
109 |
<td class="has-text-centered performance-medium">0.796</td>
|
110 |
</tr>
|
111 |
<tr>
|
112 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
113 |
<td class="has-text-centered">0.666</td>
|
114 |
<td class="has-text-centered">0.218</td>
|
115 |
<td class="has-text-centered">0.586</td>
|
116 |
</tr>
|
117 |
<tr>
|
118 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
119 |
<td class="has-text-centered">0.790</td>
|
120 |
<td class="has-text-centered">0.225</td>
|
121 |
<td class="has-text-centered">0.660</td>
|
122 |
</tr>
|
123 |
<tr>
|
124 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
125 |
<td class="has-text-centered performance-best">0.844</td>
|
126 |
<td class="has-text-centered">0.402</td>
|
127 |
<td class="has-text-centered">0.700</td>
|
128 |
</tr>
|
129 |
<tr>
|
130 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
131 |
<td class="has-text-centered">0.803</td>
|
132 |
<td class="has-text-centered">0.421</td>
|
133 |
<td class="has-text-centered">0.733</td>
|
134 |
</tr>
|
135 |
<tr>
|
136 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
137 |
<td class="has-text-centered">0.709</td>
|
138 |
<td class="has-text-centered">0.212</td>
|
139 |
<td class="has-text-centered">0.716</td>
|
140 |
</tr>
|
141 |
<tr>
|
142 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
143 |
<td class="has-text-centered">0.776</td>
|
144 |
<td class="has-text-centered">0.259</td>
|
145 |
<td class="has-text-centered">0.698</td>
|
146 |
</tr>
|
147 |
<tr>
|
148 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
149 |
<td class="has-text-centered">0.829</td>
|
150 |
<td class="has-text-centered">0.280</td>
|
151 |
<td class="has-text-centered">0.763</td>
|
152 |
</tr>
|
153 |
<tr>
|
154 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
155 |
<td class="has-text-centered performance-low">0.836</td>
|
156 |
<td class="has-text-centered performance-low">0.749</td>
|
157 |
<td class="has-text-centered">0.754</td>
|
158 |
</tr>
|
159 |
<tr>
|
160 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
161 |
<td class="has-text-centered">0.799</td>
|
162 |
<td class="has-text-centered performance-medium">0.840</td>
|
163 |
<td class="has-text-centered">0.698</td>
|
results.html
CHANGED
@@ -3135,5 +3135,6 @@
|
|
3135 |
<script src="static/js/tooltips.js"></script>
|
3136 |
<script src="static/js/fixed-tooltips.js"></script>
|
3137 |
<script src="static/js/tooltip-fix.js"></script>
|
|
|
3138 |
</body>
|
3139 |
</html>
|
|
|
3135 |
<script src="static/js/tooltips.js"></script>
|
3136 |
<script src="static/js/fixed-tooltips.js"></script>
|
3137 |
<script src="static/js/tooltip-fix.js"></script>
|
3138 |
+
<script src="static/js/model-tooltips.js"></script>
|
3139 |
</body>
|
3140 |
</html>
|
sentiment_analysis_table.html
CHANGED
@@ -35,7 +35,7 @@
|
|
35 |
</thead>
|
36 |
<tbody>
|
37 |
<tr>
|
38 |
-
<td>Llama 3 70B Instruct</td>
|
39 |
<td class="has-text-centered">0.123</td>
|
40 |
<td class="has-text-centered">0.290</td>
|
41 |
<td class="has-text-centered">0.272</td>
|
@@ -49,7 +49,7 @@
|
|
49 |
<td class="has-text-centered">0.573</td>
|
50 |
</tr>
|
51 |
<tr>
|
52 |
-
<td>Llama 3 8B Instruct</td>
|
53 |
<td class="has-text-centered">0.161</td>
|
54 |
<td class="has-text-centered">0.344</td>
|
55 |
<td class="has-text-centered">0.045</td>
|
@@ -63,7 +63,7 @@
|
|
63 |
<td class="has-text-centered">0.625</td>
|
64 |
</tr>
|
65 |
<tr>
|
66 |
-
<td>DBRX Instruct</td>
|
67 |
<td class="has-text-centered">0.160</td>
|
68 |
<td class="has-text-centered">0.321</td>
|
69 |
<td class="has-text-centered">0.052</td>
|
@@ -77,7 +77,7 @@
|
|
77 |
<td class="has-text-centered">0.541</td>
|
78 |
</tr>
|
79 |
<tr>
|
80 |
-
<td>DeepSeek LLM (67B)</td>
|
81 |
<td class="has-text-centered">0.118</td>
|
82 |
<td class="has-text-centered">0.278</td>
|
83 |
<td class="has-text-centered">0.302</td>
|
@@ -91,7 +91,7 @@
|
|
91 |
<td class="has-text-centered">0.544</td>
|
92 |
</tr>
|
93 |
<tr>
|
94 |
-
<td>Gemma 2 27B</td>
|
95 |
<td class="has-text-centered performance-best">0.100</td>
|
96 |
<td class="has-text-centered performance-best">0.266</td>
|
97 |
<td class="has-text-centered">0.406</td>
|
@@ -105,7 +105,7 @@
|
|
105 |
<td class="has-text-centered">0.524</td>
|
106 |
</tr>
|
107 |
<tr>
|
108 |
-
<td>Gemma 2 9B</td>
|
109 |
<td class="has-text-centered">0.189</td>
|
110 |
<td class="has-text-centered">0.352</td>
|
111 |
<td class="has-text-centered">-0.120</td>
|
@@ -119,7 +119,7 @@
|
|
119 |
<td class="has-text-centered">0.499</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
123 |
<td class="has-text-centered">0.135</td>
|
124 |
<td class="has-text-centered">0.278</td>
|
125 |
<td class="has-text-centered">0.200</td>
|
@@ -133,7 +133,7 @@
|
|
133 |
<td class="has-text-centered">0.542</td>
|
134 |
</tr>
|
135 |
<tr>
|
136 |
-
<td>Mixtral-8x22B Instruct</td>
|
137 |
<td class="has-text-centered">0.221</td>
|
138 |
<td class="has-text-centered">0.364</td>
|
139 |
<td class="has-text-centered">-0.310</td>
|
@@ -147,7 +147,7 @@
|
|
147 |
<td class="has-text-centered">0.538</td>
|
148 |
</tr>
|
149 |
<tr>
|
150 |
-
<td>Mixtral-8x7B Instruct</td>
|
151 |
<td class="has-text-centered">0.208</td>
|
152 |
<td class="has-text-centered">0.307</td>
|
153 |
<td class="has-text-centered">-0.229</td>
|
@@ -161,7 +161,7 @@
|
|
161 |
<td class="has-text-centered">0.518</td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
-
<td>Qwen 2 Instruct (72B)</td>
|
165 |
<td class="has-text-centered">0.205</td>
|
166 |
<td class="has-text-centered">0.409</td>
|
167 |
<td class="has-text-centered">-0.212</td>
|
@@ -175,7 +175,7 @@
|
|
175 |
<td class="has-text-centered">0.601</td>
|
176 |
</tr>
|
177 |
<tr>
|
178 |
-
<td>WizardLM-2 8x22B</td>
|
179 |
<td class="has-text-centered">0.129</td>
|
180 |
<td class="has-text-centered">0.283</td>
|
181 |
<td class="has-text-centered">0.239</td>
|
@@ -189,7 +189,7 @@
|
|
189 |
<td class="has-text-centered">0.570</td>
|
190 |
</tr>
|
191 |
<tr>
|
192 |
-
<td>DeepSeek-V3</td>
|
193 |
<td class="has-text-centered">0.150</td>
|
194 |
<td class="has-text-centered">0.311</td>
|
195 |
<td class="has-text-centered">0.111</td>
|
@@ -203,7 +203,7 @@
|
|
203 |
<td class="has-text-centered">0.572</td>
|
204 |
</tr>
|
205 |
<tr>
|
206 |
-
<td>DeepSeek R1</td>
|
207 |
<td class="has-text-centered performance-low">0.110</td>
|
208 |
<td class="has-text-centered">0.289</td>
|
209 |
<td class="has-text-centered">0.348</td>
|
@@ -217,7 +217,7 @@
|
|
217 |
<td class="has-text-centered">0.489</td>
|
218 |
</tr>
|
219 |
<tr>
|
220 |
-
<td>QwQ-32B-Preview</td>
|
221 |
<td class="has-text-centered">0.141</td>
|
222 |
<td class="has-text-centered">0.290</td>
|
223 |
<td class="has-text-centered">0.165</td>
|
@@ -231,7 +231,7 @@
|
|
231 |
<td class="has-text-centered">0.534</td>
|
232 |
</tr>
|
233 |
<tr>
|
234 |
-
<td>Jamba 1.5 Mini</td>
|
235 |
<td class="has-text-centered performance-low">0.119</td>
|
236 |
<td class="has-text-centered">0.282</td>
|
237 |
<td class="has-text-centered">0.293</td>
|
@@ -245,7 +245,7 @@
|
|
245 |
<td class="has-text-centered">0.525</td>
|
246 |
</tr>
|
247 |
<tr>
|
248 |
-
<td>Jamba 1.5 Large</td>
|
249 |
<td class="has-text-centered">0.183</td>
|
250 |
<td class="has-text-centered">0.363</td>
|
251 |
<td class="has-text-centered">-0.085</td>
|
@@ -259,7 +259,7 @@
|
|
259 |
<td class="has-text-centered">0.573</td>
|
260 |
</tr>
|
261 |
<tr>
|
262 |
-
<td>Claude 3.5 Sonnet</td>
|
263 |
<td class="has-text-centered performance-low">0.101</td>
|
264 |
<td class="has-text-centered performance-low">0.268</td>
|
265 |
<td class="has-text-centered performance-best">0.402</td>
|
@@ -273,7 +273,7 @@
|
|
273 |
<td class="has-text-centered performance-medium">0.585</td>
|
274 |
</tr>
|
275 |
<tr>
|
276 |
-
<td>Claude 3 Haiku</td>
|
277 |
<td class="has-text-centered">0.167</td>
|
278 |
<td class="has-text-centered">0.349</td>
|
279 |
<td class="has-text-centered">0.008</td>
|
@@ -287,7 +287,7 @@
|
|
287 |
<td class="has-text-centered">0.538</td>
|
288 |
</tr>
|
289 |
<tr>
|
290 |
-
<td>Cohere Command R 7B</td>
|
291 |
<td class="has-text-centered">0.164</td>
|
292 |
<td class="has-text-centered">0.319</td>
|
293 |
<td class="has-text-centered">0.028</td>
|
@@ -301,7 +301,7 @@
|
|
301 |
<td class="has-text-centered">0.547</td>
|
302 |
</tr>
|
303 |
<tr>
|
304 |
-
<td>Cohere Command R +</td>
|
305 |
<td class="has-text-centered performance-medium">0.106</td>
|
306 |
<td class="has-text-centered">0.274</td>
|
307 |
<td class="has-text-centered performance-medium">0.373</td>
|
@@ -315,7 +315,7 @@
|
|
315 |
<td class="has-text-centered">0.547</td>
|
316 |
</tr>
|
317 |
<tr>
|
318 |
-
<td>Google Gemini 1.5 Pro</td>
|
319 |
<td class="has-text-centered">0.144</td>
|
320 |
<td class="has-text-centered">0.329</td>
|
321 |
<td class="has-text-centered">0.149</td>
|
@@ -329,7 +329,7 @@
|
|
329 |
<td class="has-text-centered performance-best">0.587</td>
|
330 |
</tr>
|
331 |
<tr>
|
332 |
-
<td>OpenAI gpt-4o</td>
|
333 |
<td class="has-text-centered">0.184</td>
|
334 |
<td class="has-text-centered">0.317</td>
|
335 |
<td class="has-text-centered">-0.089</td>
|
@@ -343,7 +343,7 @@
|
|
343 |
<td class="has-text-centered">0.515</td>
|
344 |
</tr>
|
345 |
<tr>
|
346 |
-
<td>OpenAI o1-mini</td>
|
347 |
<td class="has-text-centered performance-medium">0.120</td>
|
348 |
<td class="has-text-centered">0.295</td>
|
349 |
<td class="has-text-centered">0.289</td>
|
|
|
35 |
</thead>
|
36 |
<tbody>
|
37 |
<tr>
|
38 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
39 |
<td class="has-text-centered">0.123</td>
|
40 |
<td class="has-text-centered">0.290</td>
|
41 |
<td class="has-text-centered">0.272</td>
|
|
|
49 |
<td class="has-text-centered">0.573</td>
|
50 |
</tr>
|
51 |
<tr>
|
52 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
53 |
<td class="has-text-centered">0.161</td>
|
54 |
<td class="has-text-centered">0.344</td>
|
55 |
<td class="has-text-centered">0.045</td>
|
|
|
63 |
<td class="has-text-centered">0.625</td>
|
64 |
</tr>
|
65 |
<tr>
|
66 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
67 |
<td class="has-text-centered">0.160</td>
|
68 |
<td class="has-text-centered">0.321</td>
|
69 |
<td class="has-text-centered">0.052</td>
|
|
|
77 |
<td class="has-text-centered">0.541</td>
|
78 |
</tr>
|
79 |
<tr>
|
80 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
81 |
<td class="has-text-centered">0.118</td>
|
82 |
<td class="has-text-centered">0.278</td>
|
83 |
<td class="has-text-centered">0.302</td>
|
|
|
91 |
<td class="has-text-centered">0.544</td>
|
92 |
</tr>
|
93 |
<tr>
|
94 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
95 |
<td class="has-text-centered performance-best">0.100</td>
|
96 |
<td class="has-text-centered performance-best">0.266</td>
|
97 |
<td class="has-text-centered">0.406</td>
|
|
|
105 |
<td class="has-text-centered">0.524</td>
|
106 |
</tr>
|
107 |
<tr>
|
108 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
109 |
<td class="has-text-centered">0.189</td>
|
110 |
<td class="has-text-centered">0.352</td>
|
111 |
<td class="has-text-centered">-0.120</td>
|
|
|
119 |
<td class="has-text-centered">0.499</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
123 |
<td class="has-text-centered">0.135</td>
|
124 |
<td class="has-text-centered">0.278</td>
|
125 |
<td class="has-text-centered">0.200</td>
|
|
|
133 |
<td class="has-text-centered">0.542</td>
|
134 |
</tr>
|
135 |
<tr>
|
136 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
137 |
<td class="has-text-centered">0.221</td>
|
138 |
<td class="has-text-centered">0.364</td>
|
139 |
<td class="has-text-centered">-0.310</td>
|
|
|
147 |
<td class="has-text-centered">0.538</td>
|
148 |
</tr>
|
149 |
<tr>
|
150 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
151 |
<td class="has-text-centered">0.208</td>
|
152 |
<td class="has-text-centered">0.307</td>
|
153 |
<td class="has-text-centered">-0.229</td>
|
|
|
161 |
<td class="has-text-centered">0.518</td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
165 |
<td class="has-text-centered">0.205</td>
|
166 |
<td class="has-text-centered">0.409</td>
|
167 |
<td class="has-text-centered">-0.212</td>
|
|
|
175 |
<td class="has-text-centered">0.601</td>
|
176 |
</tr>
|
177 |
<tr>
|
178 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
179 |
<td class="has-text-centered">0.129</td>
|
180 |
<td class="has-text-centered">0.283</td>
|
181 |
<td class="has-text-centered">0.239</td>
|
|
|
189 |
<td class="has-text-centered">0.570</td>
|
190 |
</tr>
|
191 |
<tr>
|
192 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
193 |
<td class="has-text-centered">0.150</td>
|
194 |
<td class="has-text-centered">0.311</td>
|
195 |
<td class="has-text-centered">0.111</td>
|
|
|
203 |
<td class="has-text-centered">0.572</td>
|
204 |
</tr>
|
205 |
<tr>
|
206 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
207 |
<td class="has-text-centered performance-low">0.110</td>
|
208 |
<td class="has-text-centered">0.289</td>
|
209 |
<td class="has-text-centered">0.348</td>
|
|
|
217 |
<td class="has-text-centered">0.489</td>
|
218 |
</tr>
|
219 |
<tr>
|
220 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
221 |
<td class="has-text-centered">0.141</td>
|
222 |
<td class="has-text-centered">0.290</td>
|
223 |
<td class="has-text-centered">0.165</td>
|
|
|
231 |
<td class="has-text-centered">0.534</td>
|
232 |
</tr>
|
233 |
<tr>
|
234 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
235 |
<td class="has-text-centered performance-low">0.119</td>
|
236 |
<td class="has-text-centered">0.282</td>
|
237 |
<td class="has-text-centered">0.293</td>
|
|
|
245 |
<td class="has-text-centered">0.525</td>
|
246 |
</tr>
|
247 |
<tr>
|
248 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
249 |
<td class="has-text-centered">0.183</td>
|
250 |
<td class="has-text-centered">0.363</td>
|
251 |
<td class="has-text-centered">-0.085</td>
|
|
|
259 |
<td class="has-text-centered">0.573</td>
|
260 |
</tr>
|
261 |
<tr>
|
262 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
263 |
<td class="has-text-centered performance-low">0.101</td>
|
264 |
<td class="has-text-centered performance-low">0.268</td>
|
265 |
<td class="has-text-centered performance-best">0.402</td>
|
|
|
273 |
<td class="has-text-centered performance-medium">0.585</td>
|
274 |
</tr>
|
275 |
<tr>
|
276 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
277 |
<td class="has-text-centered">0.167</td>
|
278 |
<td class="has-text-centered">0.349</td>
|
279 |
<td class="has-text-centered">0.008</td>
|
|
|
287 |
<td class="has-text-centered">0.538</td>
|
288 |
</tr>
|
289 |
<tr>
|
290 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
291 |
<td class="has-text-centered">0.164</td>
|
292 |
<td class="has-text-centered">0.319</td>
|
293 |
<td class="has-text-centered">0.028</td>
|
|
|
301 |
<td class="has-text-centered">0.547</td>
|
302 |
</tr>
|
303 |
<tr>
|
304 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
305 |
<td class="has-text-centered performance-medium">0.106</td>
|
306 |
<td class="has-text-centered">0.274</td>
|
307 |
<td class="has-text-centered performance-medium">0.373</td>
|
|
|
315 |
<td class="has-text-centered">0.547</td>
|
316 |
</tr>
|
317 |
<tr>
|
318 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
319 |
<td class="has-text-centered">0.144</td>
|
320 |
<td class="has-text-centered">0.329</td>
|
321 |
<td class="has-text-centered">0.149</td>
|
|
|
329 |
<td class="has-text-centered performance-best">0.587</td>
|
330 |
</tr>
|
331 |
<tr>
|
332 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
333 |
<td class="has-text-centered">0.184</td>
|
334 |
<td class="has-text-centered">0.317</td>
|
335 |
<td class="has-text-centered">-0.089</td>
|
|
|
343 |
<td class="has-text-centered">0.515</td>
|
344 |
</tr>
|
345 |
<tr>
|
346 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
347 |
<td class="has-text-centered performance-medium">0.120</td>
|
348 |
<td class="has-text-centered">0.295</td>
|
349 |
<td class="has-text-centered">0.289</td>
|
static/js/model-tooltips.js
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
document.addEventListener('DOMContentLoaded', function () {
    // Ordered substring -> tooltip lookup. First match wins, so the entry
    // order mirrors the original if/else chain (e.g. "Mixtral-8x22B" and
    // "Mixtral-8x7B" must be tested before the bare "Mistral" fallback).
    const MODEL_TOOLTIPS = [
        [["GPT-4o", "gpt-4o"], "OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following."],
        [["o1-mini"], "OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count."],
        [["Claude 3.5 Sonnet"], "Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities."],
        [["Claude 3 Haiku"], "Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks."],
        [["Gemini 1.5"], "Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities."],
        [["Command R 7B"], "Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size."],
        [["Command R +"], "Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart."],
        [["DeepSeek R1"], "DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks."],
        [["DeepSeek-V3", "DeepSeek V3"], "DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities."],
        [["DeepSeek LLM"], "DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities."],
        [["Llama 3 70B"], "Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities."],
        [["Llama 3 8B"], "Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities."],
        [["DBRX"], "Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities."],
        [["Mixtral-8x22B"], "Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance."],
        [["Mixtral-8x7B"], "Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities."],
        [["Mistral"], "Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size."],
        [["Qwen 2"], "Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities."],
        [["WizardLM"], "A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks."],
        [["Gemma 2 27B"], "Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following."],
        [["Gemma 2 9B"], "Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size."],
        [["QwQ-32B"], "Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks."],
        [["Jamba 1.5 Mini"], "A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks."],
        [["Jamba 1.5 Large"], "An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart."]
    ];

    // Generic fallback for any model name not covered by the table above.
    const DEFAULT_TOOLTIP = "A language model evaluated in the FLaME financial benchmark. Assessed across multiple financial NLP tasks including classification, summarization, QA, and more.";

    // Decorate every first-column (model name) cell, in every table on the
    // page, with the tooltip attributes the shared tooltip scripts consume.
    function fixAllModelTooltips() {
        console.log("Fixing model tooltips in all tabs");

        for (const cell of document.querySelectorAll('td:first-child')) {
            // Cells that already carry a tooltip are left untouched.
            if (cell.classList.contains('tooltip-trigger')) {
                continue;
            }

            const modelName = cell.textContent.trim();

            // Mark the cell as a tooltip anchor.
            cell.classList.add('tooltip-trigger');
            cell.style.position = 'relative';
            cell.setAttribute('data-title', modelName);

            // First matching entry wins; fall back to the generic blurb.
            const entry = MODEL_TOOLTIPS.find(function (pair) {
                return pair[0].some(function (needle) {
                    return modelName.includes(needle);
                });
            });
            cell.setAttribute('data-tooltip', entry ? entry[1] : DEFAULT_TOOLTIP);
        }

        // Re-run the positioning fix (from tooltip-fix.js) if it is loaded.
        if (window.fixProblemTooltips) {
            window.fixProblemTooltips();
        }
    }

    // Initial pass once the page has had a moment to render.
    setTimeout(fixAllModelTooltips, 500);

    // Re-apply whenever a tab is activated, after its content becomes visible.
    for (const tab of document.querySelectorAll('.tabs li')) {
        tab.addEventListener('click', function () {
            setTimeout(fixAllModelTooltips, 200);
        });
    }
});
|
text_classification_table.html
CHANGED
@@ -43,7 +43,7 @@
|
|
43 |
</thead>
|
44 |
<tbody>
|
45 |
<tr>
|
46 |
-
<td>Llama 3 70B Instruct</td>
|
47 |
<td class="has-text-centered">0.660</td>
|
48 |
<td class="has-text-centered">0.748</td>
|
49 |
<td class="has-text-centered">0.660</td>
|
@@ -63,7 +63,7 @@
|
|
63 |
<td class="has-text-centered">0.811</td>
|
64 |
</tr>
|
65 |
<tr>
|
66 |
-
<td>Llama 3 8B Instruct</td>
|
67 |
<td class="has-text-centered">0.534</td>
|
68 |
<td class="has-text-centered">0.672</td>
|
69 |
<td class="has-text-centered">0.534</td>
|
@@ -83,7 +83,7 @@
|
|
83 |
<td class="has-text-centered">0.763</td>
|
84 |
</tr>
|
85 |
<tr>
|
86 |
-
<td>DBRX Instruct</td>
|
87 |
<td class="has-text-centered">0.578</td>
|
88 |
<td class="has-text-centered">0.706</td>
|
89 |
<td class="has-text-centered">0.578</td>
|
@@ -103,7 +103,7 @@
|
|
103 |
<td class="has-text-centered">0.746</td>
|
104 |
</tr>
|
105 |
<tr>
|
106 |
-
<td>DeepSeek LLM (67B)</td>
|
107 |
<td class="has-text-centered">0.596</td>
|
108 |
<td class="has-text-centered">0.711</td>
|
109 |
<td class="has-text-centered">0.596</td>
|
@@ -123,7 +123,7 @@
|
|
123 |
<td class="has-text-centered">0.778</td>
|
124 |
</tr>
|
125 |
<tr>
|
126 |
-
<td>Gemma 2 27B</td>
|
127 |
<td class="has-text-centered">0.639</td>
|
128 |
<td class="has-text-centered">0.730</td>
|
129 |
<td class="has-text-centered">0.639</td>
|
@@ -143,7 +143,7 @@
|
|
143 |
<td class="has-text-centered">0.808</td>
|
144 |
</tr>
|
145 |
<tr>
|
146 |
-
<td>Gemma 2 9B</td>
|
147 |
<td class="has-text-centered">0.630</td>
|
148 |
<td class="has-text-centered">0.710</td>
|
149 |
<td class="has-text-centered">0.630</td>
|
@@ -163,7 +163,7 @@
|
|
163 |
<td class="has-text-centered performance-best">0.856</td>
|
164 |
</tr>
|
165 |
<tr>
|
166 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
167 |
<td class="has-text-centered">0.547</td>
|
168 |
<td class="has-text-centered">0.677</td>
|
169 |
<td class="has-text-centered">0.547</td>
|
@@ -183,7 +183,7 @@
|
|
183 |
<td class="has-text-centered">0.779</td>
|
184 |
</tr>
|
185 |
<tr>
|
186 |
-
<td>Mixtral-8x22B Instruct</td>
|
187 |
<td class="has-text-centered">0.622</td>
|
188 |
<td class="has-text-centered">0.718</td>
|
189 |
<td class="has-text-centered">0.622</td>
|
@@ -203,7 +203,7 @@
|
|
203 |
<td class="has-text-centered performance-medium">0.835</td>
|
204 |
</tr>
|
205 |
<tr>
|
206 |
-
<td>Mixtral-8x7B Instruct</td>
|
207 |
<td class="has-text-centered">0.567</td>
|
208 |
<td class="has-text-centered">0.693</td>
|
209 |
<td class="has-text-centered">0.567</td>
|
@@ -223,7 +223,7 @@
|
|
223 |
<td class="has-text-centered">0.805</td>
|
224 |
</tr>
|
225 |
<tr>
|
226 |
-
<td>Qwen 2 Instruct (72B)</td>
|
227 |
<td class="has-text-centered">0.644</td>
|
228 |
<td class="has-text-centered">0.730</td>
|
229 |
<td class="has-text-centered">0.644</td>
|
@@ -243,7 +243,7 @@
|
|
243 |
<td class="has-text-centered">0.830</td>
|
244 |
</tr>
|
245 |
<tr>
|
246 |
-
<td>WizardLM-2 8x22B</td>
|
247 |
<td class="has-text-centered">0.664</td>
|
248 |
<td class="has-text-centered">0.737</td>
|
249 |
<td class="has-text-centered">0.664</td>
|
@@ -263,7 +263,7 @@
|
|
263 |
<td class="has-text-centered">0.797</td>
|
264 |
</tr>
|
265 |
<tr>
|
266 |
-
<td>DeepSeek-V3</td>
|
267 |
<td class="has-text-centered performance-strong">0.722</td>
|
268 |
<td class="has-text-centered performance-medium">0.774</td>
|
269 |
<td class="has-text-centered performance-strong">0.722</td>
|
@@ -283,7 +283,7 @@
|
|
283 |
<td class="has-text-centered">0.729</td>
|
284 |
</tr>
|
285 |
<tr>
|
286 |
-
<td>DeepSeek R1</td>
|
287 |
<td class="has-text-centered performance-best">0.772</td>
|
288 |
<td class="has-text-centered performance-strong">0.789</td>
|
289 |
<td class="has-text-centered performance-best">0.772</td>
|
@@ -303,7 +303,7 @@
|
|
303 |
<td class="has-text-centered">0.769</td>
|
304 |
</tr>
|
305 |
<tr>
|
306 |
-
<td>QwQ-32B-Preview</td>
|
307 |
<td class="has-text-centered">0.577</td>
|
308 |
<td class="has-text-centered">0.747</td>
|
309 |
<td class="has-text-centered">0.577</td>
|
@@ -323,7 +323,7 @@
|
|
323 |
<td class="has-text-centered">0.744</td>
|
324 |
</tr>
|
325 |
<tr>
|
326 |
-
<td>Jamba 1.5 Mini</td>
|
327 |
<td class="has-text-centered">0.528</td>
|
328 |
<td class="has-text-centered">0.630</td>
|
329 |
<td class="has-text-centered">0.528</td>
|
@@ -343,7 +343,7 @@
|
|
343 |
<td class="has-text-centered">0.682</td>
|
344 |
</tr>
|
345 |
<tr>
|
346 |
-
<td>Jamba 1.5 Large</td>
|
347 |
<td class="has-text-centered">0.642</td>
|
348 |
<td class="has-text-centered">0.746</td>
|
349 |
<td class="has-text-centered">0.642</td>
|
@@ -363,7 +363,7 @@
|
|
363 |
<td class="has-text-centered">0.782</td>
|
364 |
</tr>
|
365 |
<tr>
|
366 |
-
<td>Claude 3.5 Sonnet</td>
|
367 |
<td class="has-text-centered">0.682</td>
|
368 |
<td class="has-text-centered">0.755</td>
|
369 |
<td class="has-text-centered">0.682</td>
|
@@ -383,7 +383,7 @@
|
|
383 |
<td class="has-text-centered">0.827</td>
|
384 |
</tr>
|
385 |
<tr>
|
386 |
-
<td>Claude 3 Haiku</td>
|
387 |
<td class="has-text-centered">0.639</td>
|
388 |
<td class="has-text-centered">0.735</td>
|
389 |
<td class="has-text-centered">0.639</td>
|
@@ -403,7 +403,7 @@
|
|
403 |
<td class="has-text-centered">0.781</td>
|
404 |
</tr>
|
405 |
<tr>
|
406 |
-
<td>Cohere Command R 7B</td>
|
407 |
<td class="has-text-centered">0.530</td>
|
408 |
<td class="has-text-centered">0.650</td>
|
409 |
<td class="has-text-centered">0.530</td>
|
@@ -423,7 +423,7 @@
|
|
423 |
<td class="has-text-centered">0.770</td>
|
424 |
</tr>
|
425 |
<tr>
|
426 |
-
<td>Cohere Command R +</td>
|
427 |
<td class="has-text-centered">0.660</td>
|
428 |
<td class="has-text-centered">0.747</td>
|
429 |
<td class="has-text-centered">0.660</td>
|
@@ -443,7 +443,7 @@
|
|
443 |
<td class="has-text-centered">0.812</td>
|
444 |
</tr>
|
445 |
<tr>
|
446 |
-
<td>Google Gemini 1.5 Pro</td>
|
447 |
<td class="has-text-centered">0.483</td>
|
448 |
<td class="has-text-centered">0.487</td>
|
449 |
<td class="has-text-centered">0.483</td>
|
@@ -463,7 +463,7 @@
|
|
463 |
<td class="has-text-centered performance-strong">0.837</td>
|
464 |
</tr>
|
465 |
<tr>
|
466 |
-
<td>OpenAI gpt-4o</td>
|
467 |
<td class="has-text-centered performance-medium">0.704</td>
|
468 |
<td class="has-text-centered performance-best">0.792</td>
|
469 |
<td class="has-text-centered performance-medium">0.704</td>
|
@@ -483,7 +483,7 @@
|
|
483 |
<td class="has-text-centered">0.824</td>
|
484 |
</tr>
|
485 |
<tr>
|
486 |
-
<td>OpenAI o1-mini</td>
|
487 |
<td class="has-text-centered">0.681</td>
|
488 |
<td class="has-text-centered">0.760</td>
|
489 |
<td class="has-text-centered">0.681</td>
|
|
|
43 |
</thead>
|
44 |
<tbody>
|
45 |
<tr>
|
46 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
47 |
<td class="has-text-centered">0.660</td>
|
48 |
<td class="has-text-centered">0.748</td>
|
49 |
<td class="has-text-centered">0.660</td>
|
|
|
63 |
<td class="has-text-centered">0.811</td>
|
64 |
</tr>
|
65 |
<tr>
|
66 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
67 |
<td class="has-text-centered">0.534</td>
|
68 |
<td class="has-text-centered">0.672</td>
|
69 |
<td class="has-text-centered">0.534</td>
|
|
|
83 |
<td class="has-text-centered">0.763</td>
|
84 |
</tr>
|
85 |
<tr>
|
86 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
87 |
<td class="has-text-centered">0.578</td>
|
88 |
<td class="has-text-centered">0.706</td>
|
89 |
<td class="has-text-centered">0.578</td>
|
|
|
103 |
<td class="has-text-centered">0.746</td>
|
104 |
</tr>
|
105 |
<tr>
|
106 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
107 |
<td class="has-text-centered">0.596</td>
|
108 |
<td class="has-text-centered">0.711</td>
|
109 |
<td class="has-text-centered">0.596</td>
|
|
|
123 |
<td class="has-text-centered">0.778</td>
|
124 |
</tr>
|
125 |
<tr>
|
126 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
127 |
<td class="has-text-centered">0.639</td>
|
128 |
<td class="has-text-centered">0.730</td>
|
129 |
<td class="has-text-centered">0.639</td>
|
|
|
143 |
<td class="has-text-centered">0.808</td>
|
144 |
</tr>
|
145 |
<tr>
|
146 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
147 |
<td class="has-text-centered">0.630</td>
|
148 |
<td class="has-text-centered">0.710</td>
|
149 |
<td class="has-text-centered">0.630</td>
|
|
|
163 |
<td class="has-text-centered performance-best">0.856</td>
|
164 |
</tr>
|
165 |
<tr>
|
166 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
167 |
<td class="has-text-centered">0.547</td>
|
168 |
<td class="has-text-centered">0.677</td>
|
169 |
<td class="has-text-centered">0.547</td>
|
|
|
183 |
<td class="has-text-centered">0.779</td>
|
184 |
</tr>
|
185 |
<tr>
|
186 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
187 |
<td class="has-text-centered">0.622</td>
|
188 |
<td class="has-text-centered">0.718</td>
|
189 |
<td class="has-text-centered">0.622</td>
|
|
|
203 |
<td class="has-text-centered performance-medium">0.835</td>
|
204 |
</tr>
|
205 |
<tr>
|
206 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
207 |
<td class="has-text-centered">0.567</td>
|
208 |
<td class="has-text-centered">0.693</td>
|
209 |
<td class="has-text-centered">0.567</td>
|
|
|
223 |
<td class="has-text-centered">0.805</td>
|
224 |
</tr>
|
225 |
<tr>
|
226 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
227 |
<td class="has-text-centered">0.644</td>
|
228 |
<td class="has-text-centered">0.730</td>
|
229 |
<td class="has-text-centered">0.644</td>
|
|
|
243 |
<td class="has-text-centered">0.830</td>
|
244 |
</tr>
|
245 |
<tr>
|
246 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
247 |
<td class="has-text-centered">0.664</td>
|
248 |
<td class="has-text-centered">0.737</td>
|
249 |
<td class="has-text-centered">0.664</td>
|
|
|
263 |
<td class="has-text-centered">0.797</td>
|
264 |
</tr>
|
265 |
<tr>
|
266 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
267 |
<td class="has-text-centered performance-strong">0.722</td>
|
268 |
<td class="has-text-centered performance-medium">0.774</td>
|
269 |
<td class="has-text-centered performance-strong">0.722</td>
|
|
|
283 |
<td class="has-text-centered">0.729</td>
|
284 |
</tr>
|
285 |
<tr>
|
286 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
287 |
<td class="has-text-centered performance-best">0.772</td>
|
288 |
<td class="has-text-centered performance-strong">0.789</td>
|
289 |
<td class="has-text-centered performance-best">0.772</td>
|
|
|
303 |
<td class="has-text-centered">0.769</td>
|
304 |
</tr>
|
305 |
<tr>
|
306 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
307 |
<td class="has-text-centered">0.577</td>
|
308 |
<td class="has-text-centered">0.747</td>
|
309 |
<td class="has-text-centered">0.577</td>
|
|
|
323 |
<td class="has-text-centered">0.744</td>
|
324 |
</tr>
|
325 |
<tr>
|
326 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
327 |
<td class="has-text-centered">0.528</td>
|
328 |
<td class="has-text-centered">0.630</td>
|
329 |
<td class="has-text-centered">0.528</td>
|
|
|
343 |
<td class="has-text-centered">0.682</td>
|
344 |
</tr>
|
345 |
<tr>
|
346 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
347 |
<td class="has-text-centered">0.642</td>
|
348 |
<td class="has-text-centered">0.746</td>
|
349 |
<td class="has-text-centered">0.642</td>
|
|
|
363 |
<td class="has-text-centered">0.782</td>
|
364 |
</tr>
|
365 |
<tr>
|
366 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
367 |
<td class="has-text-centered">0.682</td>
|
368 |
<td class="has-text-centered">0.755</td>
|
369 |
<td class="has-text-centered">0.682</td>
|
|
|
383 |
<td class="has-text-centered">0.827</td>
|
384 |
</tr>
|
385 |
<tr>
|
386 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
387 |
<td class="has-text-centered">0.639</td>
|
388 |
<td class="has-text-centered">0.735</td>
|
389 |
<td class="has-text-centered">0.639</td>
|
|
|
403 |
<td class="has-text-centered">0.781</td>
|
404 |
</tr>
|
405 |
<tr>
|
406 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
407 |
<td class="has-text-centered">0.530</td>
|
408 |
<td class="has-text-centered">0.650</td>
|
409 |
<td class="has-text-centered">0.530</td>
|
|
|
423 |
<td class="has-text-centered">0.770</td>
|
424 |
</tr>
|
425 |
<tr>
|
426 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
427 |
<td class="has-text-centered">0.660</td>
|
428 |
<td class="has-text-centered">0.747</td>
|
429 |
<td class="has-text-centered">0.660</td>
|
|
|
443 |
<td class="has-text-centered">0.812</td>
|
444 |
</tr>
|
445 |
<tr>
|
446 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
447 |
<td class="has-text-centered">0.483</td>
|
448 |
<td class="has-text-centered">0.487</td>
|
449 |
<td class="has-text-centered">0.483</td>
|
|
|
463 |
<td class="has-text-centered performance-strong">0.837</td>
|
464 |
</tr>
|
465 |
<tr>
|
466 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
467 |
<td class="has-text-centered performance-medium">0.704</td>
|
468 |
<td class="has-text-centered performance-best">0.792</td>
|
469 |
<td class="has-text-centered performance-medium">0.704</td>
|
|
|
483 |
<td class="has-text-centered">0.824</td>
|
484 |
</tr>
|
485 |
<tr>
|
486 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
487 |
<td class="has-text-centered">0.681</td>
|
488 |
<td class="has-text-centered">0.760</td>
|
489 |
<td class="has-text-centered">0.681</td>
|
text_summarization_table.html
CHANGED
@@ -29,7 +29,7 @@
|
|
29 |
</thead>
|
30 |
<tbody>
|
31 |
<tr>
|
32 |
-
<td>Llama 3 70B Instruct</td>
|
33 |
<td class="has-text-centered">0.715</td>
|
34 |
<td class="has-text-centered">0.801</td>
|
35 |
<td class="has-text-centered">0.754</td>
|
@@ -38,7 +38,7 @@
|
|
38 |
<td class="has-text-centered performance-strong">0.817</td>
|
39 |
</tr>
|
40 |
<tr>
|
41 |
-
<td>Llama 3 8B Instruct</td>
|
42 |
<td class="has-text-centered">0.724</td>
|
43 |
<td class="has-text-centered">0.796</td>
|
44 |
<td class="has-text-centered">0.757</td>
|
@@ -47,7 +47,7 @@
|
|
47 |
<td class="has-text-centered">0.811</td>
|
48 |
</tr>
|
49 |
<tr>
|
50 |
-
<td>DBRX Instruct</td>
|
51 |
<td class="has-text-centered">0.680</td>
|
52 |
<td class="has-text-centered">0.786</td>
|
53 |
<td class="has-text-centered">0.729</td>
|
@@ -56,7 +56,7 @@
|
|
56 |
<td class="has-text-centered">0.806</td>
|
57 |
</tr>
|
58 |
<tr>
|
59 |
-
<td>DeepSeek LLM (67B)</td>
|
60 |
<td class="has-text-centered">0.692</td>
|
61 |
<td class="has-text-centered">0.678</td>
|
62 |
<td class="has-text-centered">0.681</td>
|
@@ -65,7 +65,7 @@
|
|
65 |
<td class="has-text-centered">0.807</td>
|
66 |
</tr>
|
67 |
<tr>
|
68 |
-
<td>Gemma 2 27B</td>
|
69 |
<td class="has-text-centered">0.680</td>
|
70 |
<td class="has-text-centered">0.777</td>
|
71 |
<td class="has-text-centered">0.723</td>
|
@@ -74,7 +74,7 @@
|
|
74 |
<td class="has-text-centered">0.814</td>
|
75 |
</tr>
|
76 |
<tr>
|
77 |
-
<td>Gemma 2 9B</td>
|
78 |
<td class="has-text-centered">0.651</td>
|
79 |
<td class="has-text-centered">0.531</td>
|
80 |
<td class="has-text-centered">0.585</td>
|
@@ -83,7 +83,7 @@
|
|
83 |
<td class="has-text-centered performance-strong">0.817</td>
|
84 |
</tr>
|
85 |
<tr>
|
86 |
-
<td>Mistral (7B) Instruct v0.3</td>
|
87 |
<td class="has-text-centered">0.702</td>
|
88 |
<td class="has-text-centered performance-strong">0.806</td>
|
89 |
<td class="has-text-centered">0.750</td>
|
@@ -92,7 +92,7 @@
|
|
92 |
<td class="has-text-centered">0.811</td>
|
93 |
</tr>
|
94 |
<tr>
|
95 |
-
<td>Mixtral-8x22B Instruct</td>
|
96 |
<td class="has-text-centered">0.713</td>
|
97 |
<td class="has-text-centered performance-best">0.812</td>
|
98 |
<td class="has-text-centered">0.758</td>
|
@@ -101,7 +101,7 @@
|
|
101 |
<td class="has-text-centered">0.815</td>
|
102 |
</tr>
|
103 |
<tr>
|
104 |
-
<td>Mixtral-8x7B Instruct</td>
|
105 |
<td class="has-text-centered">0.727</td>
|
106 |
<td class="has-text-centered">0.773</td>
|
107 |
<td class="has-text-centered">0.747</td>
|
@@ -110,7 +110,7 @@
|
|
110 |
<td class="has-text-centered">0.810</td>
|
111 |
</tr>
|
112 |
<tr>
|
113 |
-
<td>Qwen 2 Instruct (72B)</td>
|
114 |
<td class="has-text-centered">0.709</td>
|
115 |
<td class="has-text-centered performance-medium">0.804</td>
|
116 |
<td class="has-text-centered">0.752</td>
|
@@ -119,7 +119,7 @@
|
|
119 |
<td class="has-text-centered">0.811</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
-
<td>WizardLM-2 8x22B</td>
|
123 |
<td class="has-text-centered">0.677</td>
|
124 |
<td class="has-text-centered performance-strong">0.806</td>
|
125 |
<td class="has-text-centered">0.735</td>
|
@@ -128,7 +128,7 @@
|
|
128 |
<td class="has-text-centered">0.808</td>
|
129 |
</tr>
|
130 |
<tr>
|
131 |
-
<td>DeepSeek-V3</td>
|
132 |
<td class="has-text-centered">0.703</td>
|
133 |
<td class="has-text-centered performance-strong">0.806</td>
|
134 |
<td class="has-text-centered">0.750</td>
|
@@ -137,7 +137,7 @@
|
|
137 |
<td class="has-text-centered">0.815</td>
|
138 |
</tr>
|
139 |
<tr>
|
140 |
-
<td>DeepSeek R1</td>
|
141 |
<td class="has-text-centered">0.724</td>
|
142 |
<td class="has-text-centered">0.800</td>
|
143 |
<td class="has-text-centered">0.759</td>
|
@@ -146,7 +146,7 @@
|
|
146 |
<td class="has-text-centered">0.804</td>
|
147 |
</tr>
|
148 |
<tr>
|
149 |
-
<td>QwQ-32B-Preview</td>
|
150 |
<td class="has-text-centered">0.653</td>
|
151 |
<td class="has-text-centered">0.751</td>
|
152 |
<td class="has-text-centered">0.696</td>
|
@@ -155,7 +155,7 @@
|
|
155 |
<td class="has-text-centered performance-strong">0.817</td>
|
156 |
</tr>
|
157 |
<tr>
|
158 |
-
<td>Jamba 1.5 Mini</td>
|
159 |
<td class="has-text-centered">0.692</td>
|
160 |
<td class="has-text-centered">0.798</td>
|
161 |
<td class="has-text-centered">0.741</td>
|
@@ -164,7 +164,7 @@
|
|
164 |
<td class="has-text-centered performance-medium">0.816</td>
|
165 |
</tr>
|
166 |
<tr>
|
167 |
-
<td>Jamba 1.5 Large</td>
|
168 |
<td class="has-text-centered">0.679</td>
|
169 |
<td class="has-text-centered">0.800</td>
|
170 |
<td class="has-text-centered">0.734</td>
|
@@ -173,7 +173,7 @@
|
|
173 |
<td class="has-text-centered performance-best">0.818</td>
|
174 |
</tr>
|
175 |
<tr>
|
176 |
-
<td>Claude 3.5 Sonnet</td>
|
177 |
<td class="has-text-centered performance-medium">0.737</td>
|
178 |
<td class="has-text-centered">0.802</td>
|
179 |
<td class="has-text-centered performance-medium">0.767</td>
|
@@ -182,7 +182,7 @@
|
|
182 |
<td class="has-text-centered">0.813</td>
|
183 |
</tr>
|
184 |
<tr>
|
185 |
-
<td>Claude 3 Haiku</td>
|
186 |
<td class="has-text-centered">0.683</td>
|
187 |
<td class="has-text-centered">0.617</td>
|
188 |
<td class="has-text-centered">0.646</td>
|
@@ -191,7 +191,7 @@
|
|
191 |
<td class="has-text-centered">0.808</td>
|
192 |
</tr>
|
193 |
<tr>
|
194 |
-
<td>Cohere Command R 7B</td>
|
195 |
<td class="has-text-centered">0.724</td>
|
196 |
<td class="has-text-centered">0.781</td>
|
197 |
<td class="has-text-centered">0.750</td>
|
@@ -200,7 +200,7 @@
|
|
200 |
<td class="has-text-centered">0.815</td>
|
201 |
</tr>
|
202 |
<tr>
|
203 |
-
<td>Cohere Command R +</td>
|
204 |
<td class="has-text-centered">0.724</td>
|
205 |
<td class="has-text-centered">0.782</td>
|
206 |
<td class="has-text-centered">0.751</td>
|
@@ -209,7 +209,7 @@
|
|
209 |
<td class="has-text-centered">0.810</td>
|
210 |
</tr>
|
211 |
<tr>
|
212 |
-
<td>Google Gemini 1.5 Pro</td>
|
213 |
<td class="has-text-centered performance-best">0.757</td>
|
214 |
<td class="has-text-centered">0.800</td>
|
215 |
<td class="has-text-centered performance-best">0.777</td>
|
@@ -218,7 +218,7 @@
|
|
218 |
<td class="has-text-centered performance-strong">0.817</td>
|
219 |
</tr>
|
220 |
<tr>
|
221 |
-
<td>OpenAI gpt-4o</td>
|
222 |
<td class="has-text-centered performance-strong">0.755</td>
|
223 |
<td class="has-text-centered">0.793</td>
|
224 |
<td class="has-text-centered performance-strong">0.773</td>
|
@@ -227,7 +227,7 @@
|
|
227 |
<td class="has-text-centered performance-medium">0.816</td>
|
228 |
</tr>
|
229 |
<tr>
|
230 |
-
<td>OpenAI o1-mini</td>
|
231 |
<td class="has-text-centered">0.731</td>
|
232 |
<td class="has-text-centered">0.801</td>
|
233 |
<td class="has-text-centered">0.763</td>
|
|
|
29 |
</thead>
|
30 |
<tbody>
|
31 |
<tr>
|
32 |
+
<td class="tooltip-trigger" data-title="Llama 3 70B Instruct" data-tooltip="Meta's advanced 70 billion parameter dense language model optimized for instruction-following tasks. Available through Together AI and notable for complex reasoning capabilities.">Llama 3 70B Instruct</td>
|
33 |
<td class="has-text-centered">0.715</td>
|
34 |
<td class="has-text-centered">0.801</td>
|
35 |
<td class="has-text-centered">0.754</td>
|
|
|
38 |
<td class="has-text-centered performance-strong">0.817</td>
|
39 |
</tr>
|
40 |
<tr>
|
41 |
+
<td class="tooltip-trigger" data-title="Llama 3 8B Instruct" data-tooltip="Meta's efficient 8 billion parameter language model optimized for instruction-following. Balances performance and efficiency for financial tasks with reasonable reasoning capabilities.">Llama 3 8B Instruct</td>
|
42 |
<td class="has-text-centered">0.724</td>
|
43 |
<td class="has-text-centered">0.796</td>
|
44 |
<td class="has-text-centered">0.757</td>
|
|
|
47 |
<td class="has-text-centered">0.811</td>
|
48 |
</tr>
|
49 |
<tr>
|
50 |
+
<td class="tooltip-trigger" data-title="DBRX Instruct" data-tooltip="Databricks' 132 billion parameter Mixture of Experts (MoE) model focused on advanced reasoning. Demonstrates competitive performance on financial tasks with strong text processing capabilities.">DBRX Instruct</td>
|
51 |
<td class="has-text-centered">0.680</td>
|
52 |
<td class="has-text-centered">0.786</td>
|
53 |
<td class="has-text-centered">0.729</td>
|
|
|
56 |
<td class="has-text-centered">0.806</td>
|
57 |
</tr>
|
58 |
<tr>
|
59 |
+
<td class="tooltip-trigger" data-title="DeepSeek LLM (67B)" data-tooltip="DeepSeek's 67 billion parameter model optimized for chat applications. Balances performance and efficiency across financial tasks with solid reasoning capabilities.">DeepSeek LLM (67B)</td>
|
60 |
<td class="has-text-centered">0.692</td>
|
61 |
<td class="has-text-centered">0.678</td>
|
62 |
<td class="has-text-centered">0.681</td>
|
|
|
65 |
<td class="has-text-centered">0.807</td>
|
66 |
</tr>
|
67 |
<tr>
|
68 |
+
<td class="tooltip-trigger" data-title="Gemma 2 27B" data-tooltip="Google's open-weight 27 billion parameter model optimized for reasoning tasks. Balances performance and efficiency across financial domains with strong instruction-following.">Gemma 2 27B</td>
|
69 |
<td class="has-text-centered">0.680</td>
|
70 |
<td class="has-text-centered">0.777</td>
|
71 |
<td class="has-text-centered">0.723</td>
|
|
|
74 |
<td class="has-text-centered">0.814</td>
|
75 |
</tr>
|
76 |
<tr>
|
77 |
+
<td class="tooltip-trigger" data-title="Gemma 2 9B" data-tooltip="Google's efficient open-weight 9 billion parameter model. Demonstrates good performance on financial tasks relative to its smaller size.">Gemma 2 9B</td>
|
78 |
<td class="has-text-centered">0.651</td>
|
79 |
<td class="has-text-centered">0.531</td>
|
80 |
<td class="has-text-centered">0.585</td>
|
|
|
83 |
<td class="has-text-centered performance-strong">0.817</td>
|
84 |
</tr>
|
85 |
<tr>
|
86 |
+
<td class="tooltip-trigger" data-title="Mistral (7B) Instruct v0.3" data-tooltip="Mistral AI's 7 billion parameter instruction-tuned model. Demonstrates impressive efficiency with reasonable performance on financial tasks despite its smaller size.">Mistral (7B) Instruct v0.3</td>
|
87 |
<td class="has-text-centered">0.702</td>
|
88 |
<td class="has-text-centered performance-strong">0.806</td>
|
89 |
<td class="has-text-centered">0.750</td>
|
|
|
92 |
<td class="has-text-centered">0.811</td>
|
93 |
</tr>
|
94 |
<tr>
|
95 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x22B Instruct" data-tooltip="Mistral AI's 141 billion parameter MoE model with eight 22B expert networks. Features robust reasoning capabilities for financial tasks with strong instruction-following performance.">Mixtral-8x22B Instruct</td>
|
96 |
<td class="has-text-centered">0.713</td>
|
97 |
<td class="has-text-centered performance-best">0.812</td>
|
98 |
<td class="has-text-centered">0.758</td>
|
|
|
101 |
<td class="has-text-centered">0.815</td>
|
102 |
</tr>
|
103 |
<tr>
|
104 |
+
<td class="tooltip-trigger" data-title="Mixtral-8x7B Instruct" data-tooltip="Mistral AI's 47 billion parameter MoE model with eight 7B expert networks. Balances efficiency and performance with reasonable financial reasoning capabilities.">Mixtral-8x7B Instruct</td>
|
105 |
<td class="has-text-centered">0.727</td>
|
106 |
<td class="has-text-centered">0.773</td>
|
107 |
<td class="has-text-centered">0.747</td>
|
|
|
110 |
<td class="has-text-centered">0.810</td>
|
111 |
</tr>
|
112 |
<tr>
|
113 |
+
<td class="tooltip-trigger" data-title="Qwen 2 Instruct (72B)" data-tooltip="Alibaba's 72 billion parameter instruction-following model optimized for reasoning tasks. Features strong performance on financial domains with advanced text processing capabilities.">Qwen 2 Instruct (72B)</td>
|
114 |
<td class="has-text-centered">0.709</td>
|
115 |
<td class="has-text-centered performance-medium">0.804</td>
|
116 |
<td class="has-text-centered">0.752</td>
|
|
|
119 |
<td class="has-text-centered">0.811</td>
|
120 |
</tr>
|
121 |
<tr>
|
122 |
+
<td class="tooltip-trigger" data-title="WizardLM-2 8x22B" data-tooltip="A 176 billion parameter MoE model focused on complex reasoning. Designed for advanced instruction-following with strong capabilities across financial tasks.">WizardLM-2 8x22B</td>
|
123 |
<td class="has-text-centered">0.677</td>
|
124 |
<td class="has-text-centered performance-strong">0.806</td>
|
125 |
<td class="has-text-centered">0.735</td>
|
|
|
128 |
<td class="has-text-centered">0.808</td>
|
129 |
</tr>
|
130 |
<tr>
|
131 |
+
<td class="tooltip-trigger" data-title="DeepSeek-V3" data-tooltip="DeepSeek's 685 billion parameter Mixture of Experts (MoE) model optimized for advanced reasoning. Strong performance on financial tasks with robust instruction-following capabilities.">DeepSeek-V3</td>
|
132 |
<td class="has-text-centered">0.703</td>
|
133 |
<td class="has-text-centered performance-strong">0.806</td>
|
134 |
<td class="has-text-centered">0.750</td>
|
|
|
137 |
<td class="has-text-centered">0.815</td>
|
138 |
</tr>
|
139 |
<tr>
|
140 |
+
<td class="tooltip-trigger" data-title="DeepSeek R1" data-tooltip="DeepSeek's premium 671 billion parameter Mixture of Experts (MoE) model representing their most advanced offering. Designed for state-of-the-art performance across complex reasoning and financial tasks.">DeepSeek R1</td>
|
141 |
<td class="has-text-centered">0.724</td>
|
142 |
<td class="has-text-centered">0.800</td>
|
143 |
<td class="has-text-centered">0.759</td>
|
|
|
146 |
<td class="has-text-centered">0.804</td>
|
147 |
</tr>
|
148 |
<tr>
|
149 |
+
<td class="tooltip-trigger" data-title="QwQ-32B-Preview" data-tooltip="Qwen's experimental 32 billion parameter MoE model focused on efficient computation. Features interesting performance characteristics on certain financial tasks.">QwQ-32B-Preview</td>
|
150 |
<td class="has-text-centered">0.653</td>
|
151 |
<td class="has-text-centered">0.751</td>
|
152 |
<td class="has-text-centered">0.696</td>
|
|
|
155 |
<td class="has-text-centered performance-strong">0.817</td>
|
156 |
</tr>
|
157 |
<tr>
|
158 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Mini" data-tooltip="A compact variant in the Jamba model series focused on efficiency. Balances performance and computational requirements for financial tasks.">Jamba 1.5 Mini</td>
|
159 |
<td class="has-text-centered">0.692</td>
|
160 |
<td class="has-text-centered">0.798</td>
|
161 |
<td class="has-text-centered">0.741</td>
|
|
|
164 |
<td class="has-text-centered performance-medium">0.816</td>
|
165 |
</tr>
|
166 |
<tr>
|
167 |
+
<td class="tooltip-trigger" data-title="Jamba 1.5 Large" data-tooltip="An expanded variant in the Jamba model series with enhanced capabilities. Features stronger reasoning for financial tasks than its smaller counterpart.">Jamba 1.5 Large</td>
|
168 |
<td class="has-text-centered">0.679</td>
|
169 |
<td class="has-text-centered">0.800</td>
|
170 |
<td class="has-text-centered">0.734</td>
|
|
|
173 |
<td class="has-text-centered performance-best">0.818</td>
|
174 |
</tr>
|
175 |
<tr>
|
176 |
+
<td class="tooltip-trigger" data-title="Claude 3.5 Sonnet" data-tooltip="Anthropic's advanced proprietary language model optimized for complex reasoning and instruction-following. Features enhanced performance on financial tasks with strong text processing capabilities.">Claude 3.5 Sonnet</td>
|
177 |
<td class="has-text-centered performance-medium">0.737</td>
|
178 |
<td class="has-text-centered">0.802</td>
|
179 |
<td class="has-text-centered performance-medium">0.767</td>
|
|
|
182 |
<td class="has-text-centered">0.813</td>
|
183 |
</tr>
|
184 |
<tr>
|
185 |
+
<td class="tooltip-trigger" data-title="Claude 3 Haiku" data-tooltip="Anthropic's smaller efficiency-focused model in the Claude family. Designed for speed and lower computational requirements while maintaining reasonable performance on financial tasks.">Claude 3 Haiku</td>
|
186 |
<td class="has-text-centered">0.683</td>
|
187 |
<td class="has-text-centered">0.617</td>
|
188 |
<td class="has-text-centered">0.646</td>
|
|
|
191 |
<td class="has-text-centered">0.808</td>
|
192 |
</tr>
|
193 |
<tr>
|
194 |
+
<td class="tooltip-trigger" data-title="Cohere Command R 7B" data-tooltip="Cohere's 7-billion parameter model focused on instruction-following. An efficient model with reasonable financial domain capabilities for its size.">Cohere Command R 7B</td>
|
195 |
<td class="has-text-centered">0.724</td>
|
196 |
<td class="has-text-centered">0.781</td>
|
197 |
<td class="has-text-centered">0.750</td>
|
|
|
200 |
<td class="has-text-centered">0.815</td>
|
201 |
</tr>
|
202 |
<tr>
|
203 |
+
<td class="tooltip-trigger" data-title="Cohere Command R +" data-tooltip="Cohere's enhanced command model with improved instruction-following capabilities. Features advanced reasoning for financial domains with stronger performance than its smaller counterpart.">Cohere Command R +</td>
|
204 |
<td class="has-text-centered">0.724</td>
|
205 |
<td class="has-text-centered">0.782</td>
|
206 |
<td class="has-text-centered">0.751</td>
|
|
|
209 |
<td class="has-text-centered">0.810</td>
|
210 |
</tr>
|
211 |
<tr>
|
212 |
+
<td class="tooltip-trigger" data-title="Google Gemini 1.5 Pro" data-tooltip="Google's advanced proprietary multimodal model designed for complex reasoning and instruction-following tasks. Features strong performance across financial domains with advanced reasoning capabilities.">Google Gemini 1.5 Pro</td>
|
213 |
<td class="has-text-centered performance-best">0.757</td>
|
214 |
<td class="has-text-centered">0.800</td>
|
215 |
<td class="has-text-centered performance-best">0.777</td>
|
|
|
218 |
<td class="has-text-centered performance-strong">0.817</td>
|
219 |
</tr>
|
220 |
<tr>
|
221 |
+
<td class="tooltip-trigger" data-title="OpenAI gpt-4o" data-tooltip="OpenAI's flagship multimodal model optimized for a balance of quality and speed. Features strong performance across diverse tasks with capabilities for complex financial reasoning and instruction following.">OpenAI gpt-4o</td>
|
222 |
<td class="has-text-centered performance-strong">0.755</td>
|
223 |
<td class="has-text-centered">0.793</td>
|
224 |
<td class="has-text-centered performance-strong">0.773</td>
|
|
|
227 |
<td class="has-text-centered performance-medium">0.816</td>
|
228 |
</tr>
|
229 |
<tr>
|
230 |
+
<td class="tooltip-trigger" data-title="OpenAI o1-mini" data-tooltip="OpenAI's smaller advanced model balancing efficiency and performance. Demonstrates surprisingly strong results on financial tasks despite its reduced parameter count.">OpenAI o1-mini</td>
|
231 |
<td class="has-text-centered">0.731</td>
|
232 |
<td class="has-text-centered">0.801</td>
|
233 |
<td class="has-text-centered">0.763</td>
|