Commit: tables and figures

Files changed:
- FLaME/content/figures/fig_methodology_domain.jpg +0 -0
- FLaME/content/figures/fig_methodology_tasks.jpg +0 -0
- FLaME/content/figures/fig_overview_flow.jpg +0 -0
- causal_analysis_table.html +287 -0
- final_results.html +0 -0
- index.html +44 -36
- information_retrieval_table.html +555 -0
- manual_integration.md +36 -0
- qa_table.html +166 -0
- results.html +0 -0
- sentiment_analysis_table.html +360 -0
- static/css/index.css +51 -0
- static/css/results.css +84 -0
- static/js/index.js +27 -0
- static/js/results.js +34 -0
- text_classification_table.html +506 -0
- text_summarization_table.html +239 -0
FLaME/content/figures/fig_methodology_domain.jpg
ADDED
FLaME/content/figures/fig_methodology_tasks.jpg
ADDED
FLaME/content/figures/fig_overview_flow.jpg
ADDED
causal_analysis_table.html
ADDED
@@ -0,0 +1,287 @@
<!-- Causal Analysis -->
<div id="causal-analysis" class="tab-content">
  <h2 class="title is-4">Causal Analysis Task Results</h2>
  <div class="results-table">
    <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
      <thead>
        <tr>
          <th rowspan="2">Model</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="For text sections identified as causal, this task extracts the Cause and Effect spans, handling both unicausal and multicausal cases in financial texts.">Causal Detection (CD)</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="Determines if a given financial text section contains a causal relation, labeled as 1 if causal and 0 otherwise.">Causal Classification (CC)</th>
        </tr>
        <tr>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Accuracy</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Llama 3 70B Instruct</td>
          <td class="has-text-centered">0.148</td><td class="has-text-centered">0.429</td><td class="has-text-centered">0.148</td><td class="has-text-centered">0.142</td>
          <td class="has-text-centered">0.241</td><td class="has-text-centered">0.329</td><td class="has-text-centered">0.192</td><td class="has-text-centered">0.198</td>
        </tr>
        <tr>
          <td>Llama 3 8B Instruct</td>
          <td class="has-text-centered">0.097</td><td class="has-text-centered">0.341</td><td class="has-text-centered">0.097</td><td class="has-text-centered">0.049</td>
          <td class="has-text-centered">0.232</td><td class="has-text-centered">0.241</td><td class="has-text-centered">0.234</td><td class="has-text-centered performance-strong">0.380</td>
        </tr>
        <tr>
          <td>DBRX Instruct</td>
          <td class="has-text-centered">0.078</td><td class="has-text-centered">0.521</td><td class="has-text-centered">0.078</td><td class="has-text-centered">0.087</td>
          <td class="has-text-centered">0.276</td><td class="has-text-centered">0.313</td><td class="has-text-centered">0.231</td><td class="has-text-centered">0.235</td>
        </tr>
        <tr>
          <td>DeepSeek LLM (67B)</td>
          <td class="has-text-centered">0.026</td><td class="has-text-centered">0.214</td><td class="has-text-centered">0.026</td><td class="has-text-centered">0.025</td>
          <td class="has-text-centered">0.141</td><td class="has-text-centered">0.328</td><td class="has-text-centered">0.193</td><td class="has-text-centered">0.221</td>
        </tr>
        <tr>
          <td>Gemma 2 27B</td>
          <td class="has-text-centered">0.115</td><td class="has-text-centered">0.510</td><td class="has-text-centered">0.115</td><td class="has-text-centered">0.133</td>
          <td class="has-text-centered">0.309</td><td class="has-text-centered">0.310</td><td class="has-text-centered">0.242</td><td class="has-text-centered">0.262</td>
        </tr>
        <tr>
          <td>Gemma 2 9B</td>
          <td class="has-text-centered">0.115</td><td class="has-text-centered">0.394</td><td class="has-text-centered">0.115</td><td class="has-text-centered">0.105</td>
          <td class="has-text-centered">0.275</td><td class="has-text-centered">0.294</td><td class="has-text-centered">0.207</td><td class="has-text-centered">0.258</td>
        </tr>
        <tr>
          <td>Mistral (7B) Instruct v0.3</td>
          <td class="has-text-centered">0.078</td><td class="has-text-centered">0.455</td><td class="has-text-centered">0.078</td><td class="has-text-centered">0.052</td>
          <td class="has-text-centered">0.339</td><td class="has-text-centered performance-best">0.361</td><td class="has-text-centered">0.227</td><td class="has-text-centered">0.258</td>
        </tr>
        <tr>
          <td>Mixtral-8x22B Instruct</td>
          <td class="has-text-centered">0.131</td><td class="has-text-centered">0.486</td><td class="has-text-centered">0.131</td><td class="has-text-centered">0.125</td>
          <td class="has-text-centered">0.344</td><td class="has-text-centered">0.310</td><td class="has-text-centered performance-best">0.308</td><td class="has-text-centered performance-medium">0.318</td>
        </tr>
        <tr>
          <td>Mixtral-8x7B Instruct</td>
          <td class="has-text-centered">0.088</td><td class="has-text-centered">0.510</td><td class="has-text-centered">0.088</td><td class="has-text-centered">0.055</td>
          <td class="has-text-centered">0.308</td><td class="has-text-centered">0.314</td><td class="has-text-centered">0.229</td><td class="has-text-centered">0.273</td>
        </tr>
        <tr>
          <td>Qwen 2 Instruct (72B)</td>
          <td class="has-text-centered">0.139</td><td class="has-text-centered">0.489</td><td class="has-text-centered">0.139</td><td class="has-text-centered">0.190</td>
          <td class="has-text-centered">0.208</td><td class="has-text-centered">0.330</td><td class="has-text-centered">0.184</td><td class="has-text-centered">0.188</td>
        </tr>
        <tr>
          <td>WizardLM-2 8x22B</td>
          <td class="has-text-centered">0.076</td><td class="has-text-centered">0.453</td><td class="has-text-centered">0.076</td><td class="has-text-centered">0.114</td>
          <td class="has-text-centered">0.263</td><td class="has-text-centered">0.347</td><td class="has-text-centered">0.201</td><td class="has-text-centered">0.237</td>
        </tr>
        <tr>
          <td>DeepSeek-V3</td>
          <td class="has-text-centered">0.164</td><td class="has-text-centered">0.528</td><td class="has-text-centered">0.164</td><td class="has-text-centered performance-medium">0.198</td>
          <td class="has-text-centered">0.194</td><td class="has-text-centered">0.327</td><td class="has-text-centered">0.170</td><td class="has-text-centered">0.248</td>
        </tr>
        <tr>
          <td>DeepSeek R1</td>
          <td class="has-text-centered performance-best">0.245</td><td class="has-text-centered performance-strong">0.643</td><td class="has-text-centered performance-best">0.245</td><td class="has-text-centered performance-best">0.337</td>
          <td class="has-text-centered performance-best">0.385</td><td class="has-text-centered">0.318</td><td class="has-text-centered">0.202</td><td class="has-text-centered">0.221</td>
        </tr>
        <tr>
          <td>QwQ-32B-Preview</td>
          <td class="has-text-centered">0.110</td><td class="has-text-centered">0.473</td><td class="has-text-centered">0.110</td><td class="has-text-centered">0.131</td>
          <td class="has-text-centered">0.193</td><td class="has-text-centered">0.262</td><td class="has-text-centered">0.220</td><td class="has-text-centered performance-best">0.465</td>
        </tr>
        <tr>
          <td>Jamba 1.5 Mini</td>
          <td class="has-text-centered">0.050</td><td class="has-text-centered">0.280</td><td class="has-text-centered">0.050</td><td class="has-text-centered">0.043</td>
          <td class="has-text-centered">0.323</td><td class="has-text-centered">0.283</td><td class="has-text-centered performance-strong">0.270</td><td class="has-text-centered">0.295</td>
        </tr>
        <tr>
          <td>Jamba 1.5 Large</td>
          <td class="has-text-centered">0.076</td><td class="has-text-centered">0.517</td><td class="has-text-centered">0.076</td><td class="has-text-centered">0.074</td>
          <td class="has-text-centered">0.268</td><td class="has-text-centered">0.248</td><td class="has-text-centered">0.176</td><td class="has-text-centered">0.200</td>
        </tr>
        <tr>
          <td>Claude 3.5 Sonnet</td>
          <td class="has-text-centered">0.154</td><td class="has-text-centered">0.564</td><td class="has-text-centered">0.154</td><td class="has-text-centered">0.196</td>
          <td class="has-text-centered">0.259</td><td class="has-text-centered">0.336</td><td class="has-text-centered">0.197</td><td class="has-text-centered">0.235</td>
        </tr>
        <tr>
          <td>Claude 3 Haiku</td>
          <td class="has-text-centered">0.082</td><td class="has-text-centered">0.388</td><td class="has-text-centered">0.082</td><td class="has-text-centered">0.081</td>
          <td class="has-text-centered performance-medium">0.369</td><td class="has-text-centered">0.347</td><td class="has-text-centered">0.200</td><td class="has-text-centered">0.203</td>
        </tr>
        <tr>
          <td>Cohere Command R 7B</td>
          <td class="has-text-centered">0.089</td><td class="has-text-centered">0.363</td><td class="has-text-centered">0.089</td><td class="has-text-centered">0.057</td>
          <td class="has-text-centered performance-strong">0.379</td><td class="has-text-centered performance-medium">0.356</td><td class="has-text-centered performance-medium">0.255</td><td class="has-text-centered">0.275</td>
        </tr>
        <tr>
          <td>Cohere Command R +</td>
          <td class="has-text-centered">0.090</td><td class="has-text-centered">0.453</td><td class="has-text-centered">0.090</td><td class="has-text-centered">0.080</td>
          <td class="has-text-centered">0.353</td><td class="has-text-centered">0.336</td><td class="has-text-centered">0.238</td><td class="has-text-centered">0.265</td>
        </tr>
        <tr>
          <td>Google Gemini 1.5 Pro</td>
          <td class="has-text-centered performance-medium">0.165</td><td class="has-text-centered">0.514</td><td class="has-text-centered performance-medium">0.165</td><td class="has-text-centered">0.196</td>
          <td class="has-text-centered">0.265</td><td class="has-text-centered performance-strong">0.357</td><td class="has-text-centered">0.217</td><td class="has-text-centered">0.258</td>
        </tr>
        <tr>
          <td>OpenAI gpt-4o</td>
          <td class="has-text-centered">0.082</td><td class="has-text-centered performance-medium">0.576</td><td class="has-text-centered">0.082</td><td class="has-text-centered">0.130</td>
          <td class="has-text-centered">0.254</td><td class="has-text-centered">0.327</td><td class="has-text-centered">0.222</td><td class="has-text-centered">0.235</td>
        </tr>
        <tr>
          <td>OpenAI o1-mini</td>
          <td class="has-text-centered performance-strong">0.206</td><td class="has-text-centered performance-best">0.648</td><td class="has-text-centered performance-strong">0.206</td><td class="has-text-centered performance-strong">0.289</td>
          <td class="has-text-centered">0.325</td><td class="has-text-centered">0.316</td><td class="has-text-centered">0.209</td><td class="has-text-centered">0.233</td>
        </tr>
      </tbody>
    </table>
    <div class="content is-small mt-4">
      <p><strong>Note:</strong> Color highlighting indicates performance ranking:
        <span class="performance-best"> Best </span>,
        <span class="performance-strong"> Strong </span>,
        <span class="performance-medium"> Good </span>
      </p>
    </div>
  </div>
</div>
final_results.html
ADDED
The diff for this file is too large to render.
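To inspect the full file anyway, clone the Space and show the commit locally; the owner/space path below is a placeholder:

    git clone https://huggingface.co/spaces/<owner>/<space>
    cd <space>
    git show HEAD -- final_results.html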
index.html
CHANGED
@@ -163,9 +163,9 @@
     <div class="figure-container mb-5">
       <figure class="image">
         <!-- Note: In production, convert PDF to PNG/JPG -->
-        <img src="
+        <img src="FLaME/content/figures/fig_overview_flow.jpg" alt="FLaME Evaluation Framework">
         <figcaption class="has-text-centered mt-2 is-italic">
-          <strong>Figure 1:</strong>
+          <strong>Figure 1:</strong> Functional Overview of FLaME. The evaluation framework includes a comprehensive taxonomy, carefully selected datasets, diverse models, and standardized metrics.
         </figcaption>
       </figure>
     </div>

@@ -185,9 +185,9 @@
     <div class="content">
       <figure class="image">
         <!-- Note: In production, convert PDF to PNG/JPG -->
-        <img src="
+        <img src="FLaME/content/figures/fig_methodology_tasks.jpg" alt="FLaME Task Taxonomy">
       </figure>
-      <p class="mt-3">
+      <p class="mt-3"><strong>Task Taxonomy:</strong> Illustrative breakdown for each of the six core NLP task categories (Classification, Sentiment Analysis, Information Retrieval, Causal Analysis, Summarization, and Question Answering). Each category encompasses specialized variants depending on data format, user needs, and domain constraints.</p>
     </div>
   </div>
 </div>

@@ -206,9 +206,9 @@
     <div class="content">
       <figure class="image">
         <!-- Note: In production, convert PDF to PNG/JPG -->
-        <img src="
+        <img src="FLaME/content/figures/fig_methodology_domain.jpg" alt="FLaME Domain Taxonomy">
       </figure>
-      <p class="mt-3">
+      <p class="mt-3"><strong>Domain Taxonomy:</strong> Holistic taxonomy for FLaME. Unlike previous FinNLP benchmarks that were tied to specific tasks with single metrics, FLaME takes a comprehensive approach by mapping the full space of tasks, scenarios, and metrics across multiple dimensions for complete analysis.</p>
     </div>
   </div>
 </div>

@@ -245,7 +245,7 @@
   <div class="stat-box p-3">
     <span class="icon is-large"><i class="fas fa-tasks fa-2x"></i></span>
     <p class="is-size-5 mt-2"><strong>6</strong></p>
-    <p>FinNLP
+    <p>FinNLP Task Categories</p>
   </div>
 </div>
 <div class="column">

@@ -258,7 +258,7 @@
 <div class="column">
   <div class="stat-box p-3">
     <span class="icon is-large"><i class="fas fa-robot fa-2x"></i></span>
-    <p class="is-size-5 mt-2"><strong>
+    <p class="is-size-5 mt-2"><strong>24</strong></p>
     <p>LLMs Evaluated</p>
   </div>
 </div>

@@ -458,11 +458,11 @@
 </div>
 <div class="card-content">
   <div class="content">
-    <p>We developed a
-    <ul>
-      <li><strong>Tasks:</strong>
-      <li><strong>Domains:</strong>
-      <li><strong>Languages:</strong> Currently focusing on English with
+    <p>We developed a scenario-based taxonomy organizing financial NLP tasks along three dimensions:</p>
+    <ul style="list-style-type: none;">
+      <li><strong>Tasks:</strong> Six core tasks - text classification, sentiment analysis, information retrieval, causal analysis, text summarization, and question answering</li>
+      <li><strong>Domains:</strong> Categorized by what (type of data), who (data source), where (origination), when (time period), how (generation method), and why (purpose)</li>
+      <li><strong>Languages:</strong> Currently focusing on English with identified need for multilingual expansion</li>
     </ul>
   </div>
 </div>

@@ -481,11 +481,11 @@
 <div class="card-content">
   <div class="content">
     <p>We carefully selected datasets based on:</p>
-    <ul>
-      <li><strong>Domain relevance:</strong>
-      <li><strong>Licensing:</strong> Fair usage licensing and
-      <li><strong>Quality:</strong>
-      <li><strong>Complexity:</strong>
+    <ul style="list-style-type: none;">
+      <li><strong>Domain relevance:</strong> Majority of content directly related to finance</li>
+      <li><strong>Licensing:</strong> Fair usage licensing and proper attribution</li>
+      <li><strong>Quality:</strong> Transparent sourcing with minimal risk of label corruption</li>
+      <li><strong>Complexity:</strong> Exercises real financial knowledge, not trivial tasks</li>
     </ul>
   </div>
 </div>

@@ -507,7 +507,7 @@
 <div class="columns">
   <div class="column is-half">
     <h6 class="title is-6">Proprietary</h6>
-    <ul>
+    <ul style="list-style-type: none;">
       <li>GPT-4o & o1-mini</li>
       <li>Gemini-1.5</li>
       <li>Claude3</li>

@@ -516,7 +516,7 @@
 </div>
 <div class="column is-half">
   <h6 class="title is-6">Open-weight</h6>
-  <ul>
+  <ul style="list-style-type: none;">
     <li>Llama-3</li>
     <li>DeepSeekV3 & R-1</li>
     <li>Qwen-2 & QwQ</li>

@@ -542,14 +542,15 @@
 <div class="content">
   <p>Our two-stage evaluation approach includes:</p>
   <ol>
-    <li><strong>Generation:</strong>
-    <li><strong>Extraction:</strong>
+    <li><strong>Generation:</strong> Language model generates responses to task-specific inputs</li>
+    <li><strong>Extraction:</strong> Separate process identifies relevant output using structured pattern matching</li>
   </ol>
-  <p>
-  <ul>
-    <li>
-    <li>
-    <li>
+  <p>Pipeline stages:</p>
+  <ul style="list-style-type: none;">
+    <li>Configuration of tasks, datasets, and parameters</li>
+    <li>Model interaction via local instantiation or API</li>
+    <li>Post-processing and structured output extraction</li>
+    <li>Task-specific metric computation and logging</li>
   </ul>
 </div>
 </div>

@@ -574,8 +575,15 @@
 <p class="is-size-5 has-text-centered mb-0">
   Our comprehensive evaluation reveals significant performance variations across different financial tasks and models.
 </p>
-
-
+<p class="has-text-centered mt-3">
+  <a href="final_results.html" class="button is-primary">
+    <span class="icon">
+      <i class="fas fa-table"></i>
+    </span>
+    <span>View Detailed Results Tables</span>
+  </a>
+</p>
+</div>
 <!-- Performance comparison chart -->
 <div class="box mb-5">
   <h4 class="title is-4 has-text-centered mb-4">Performance Across Tasks</h4>

@@ -709,7 +717,7 @@
 </div>

 <p class="mb-2">Key insights from our model analysis:</p>
-<ul>
+<ul style="list-style-type: none;">
   <li>Inconsistent scaling: larger parameter sizes do not guarantee higher performance</li>
   <li>Open-weight models show competitive performance on many tasks</li>
   <li>Dramatic price differences between models ($4-260 USD)</li>

@@ -890,7 +898,7 @@
 </div>
 <div class="card-content">
   <div class="content">
-    <ul>
+    <ul style="list-style-type: none;">
       <li>Limited dataset size and diversity</li>
       <li>Current focus on zero-shot scenarios only</li>
       <li>English-language focus due to availability of benchmarks</li>

@@ -913,7 +921,7 @@
 </div>
 <div class="card-content">
   <div class="content">
-    <ul>
+    <ul style="list-style-type: none;">
       <li>Extend to more languages beyond English</li>
       <li>Explore few-shot and chain-of-thought prompting</li>
       <li>Evaluate domain-adaptive training for finance</li>

@@ -1045,7 +1053,7 @@
   </span>
   Banking
 </p>
-<ul
+<ul style="list-style-type: none;">
   <li>Banking77</li>
   <li>FiQA</li>
   <li>FinRED</li>

@@ -1061,7 +1069,7 @@
   </span>
   Investment
 </p>
-<ul
+<ul style="list-style-type: none;">
   <li>FPB</li>
   <li>Headlines</li>
   <li>SubjectiveQA</li>

@@ -1077,7 +1085,7 @@
   </span>
   Accounting
 </p>
-<ul
+<ul style="list-style-type: none;">
   <li>FinQA</li>
   <li>TaT-QA</li>
   <li>ConvFinQA</li>

@@ -1093,7 +1101,7 @@
   </span>
   Corporate
 </p>
-<ul
+<ul style="list-style-type: none;">
   <li>ECTSum</li>
   <li>EDTSum</li>
   <li>FinCausal</li>
information_retrieval_table.html
ADDED
@@ -0,0 +1,555 @@
<!-- Information Retrieval -->
<div id="information-retrieval" class="tab-content">
  <h2 class="title is-4">Information Retrieval Task Results</h2>
  <div class="results-table">
    <table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
      <thead>
        <tr>
          <th rowspan="2">Model</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A manually annotated dataset of 47,851 financial news articles with named entity annotations for person (PER), location (LOC), and organization (ORG) entities. Used for benchmarking financial named entity recognition performance.">FiNER</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset for financial relation extraction focusing on relations between companies and financial metrics. Contains entity-relationship annotations from financial news, earnings reports, and regulatory filings.">FinRED</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset for information retrieval in the financial domain with queries and relevant document passages. Contains 6,500 queries and 280,000 financial document passages annotated for relevance.">ReFiND</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset of financial news articles labeled for entity extraction and document classification. Contains 1,500 articles with entity annotations and multi-label categorization for financial topics.">FNXL</th>
          <th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset for financial entity recognition with 6,200 documents containing annotations for company names, financial metrics, dates, and numerical values from earnings reports and financial news.">FinEntity</th>
        </tr>
        <tr>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">F1</th>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">Precision</th>
          <th class="has-text-centered">Recall</th>
          <th class="has-text-centered">Accuracy</th>
          <th class="has-text-centered">F1</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Llama 3 70B Instruct</td>
          <td class="has-text-centered">0.715</td><td class="has-text-centered">0.693</td><td class="has-text-centered">0.701</td><td class="has-text-centered">0.911</td>
          <td class="has-text-centered">0.314</td><td class="has-text-centered performance-medium">0.454</td><td class="has-text-centered">0.314</td><td class="has-text-centered">0.332</td>
          <td class="has-text-centered">0.879</td><td class="has-text-centered">0.904</td><td class="has-text-centered">0.879</td><td class="has-text-centered">0.883</td>
          <td class="has-text-centered">0.015</td><td class="has-text-centered">0.030</td><td class="has-text-centered">0.020</td><td class="has-text-centered">0.010</td>
          <td class="has-text-centered">0.474</td><td class="has-text-centered">0.485</td><td class="has-text-centered">0.485</td><td class="has-text-centered">0.469</td>
        </tr>
        <tr>
          <td>Llama 3 8B Instruct</td>
          <td class="has-text-centered">0.581</td><td class="has-text-centered">0.558</td><td class="has-text-centered">0.565</td><td class="has-text-centered">0.854</td>
          <td class="has-text-centered">0.296</td><td class="has-text-centered">0.357</td><td class="has-text-centered">0.296</td><td class="has-text-centered">0.289</td>
          <td class="has-text-centered">0.723</td><td class="has-text-centered">0.755</td><td class="has-text-centered">0.723</td><td class="has-text-centered">0.705</td>
          <td class="has-text-centered">0.003</td><td class="has-text-centered">0.004</td><td class="has-text-centered">0.003</td><td class="has-text-centered">0.002</td>
          <td class="has-text-centered">0.301</td><td class="has-text-centered">0.478</td><td class="has-text-centered">0.478</td><td class="has-text-centered">0.350</td>
        </tr>
        <tr>
          <td>DBRX Instruct</td>
          <td class="has-text-centered">0.516</td><td class="has-text-centered">0.476</td><td class="has-text-centered">0.489</td><td class="has-text-centered">0.802</td>
          <td class="has-text-centered">0.329</td><td class="has-text-centered">0.371</td><td class="has-text-centered">0.329</td><td class="has-text-centered">0.304</td>
          <td class="has-text-centered">0.766</td><td class="has-text-centered">0.825</td><td class="has-text-centered">0.766</td><td class="has-text-centered">0.778</td>
          <td class="has-text-centered">0.008</td><td class="has-text-centered">0.011</td><td class="has-text-centered">0.009</td><td class="has-text-centered">0.005</td>
          <td class="has-text-centered">0.004</td><td class="has-text-centered">0.014</td><td class="has-text-centered">0.014</td><td class="has-text-centered">0.006</td>
        </tr>
        <tr>
          <td>DeepSeek LLM (67B)</td>
          <td class="has-text-centered">0.752</td><td class="has-text-centered">0.742</td><td class="has-text-centered">0.745</td><td class="has-text-centered">0.917</td>
          <td class="has-text-centered">0.344</td><td class="has-text-centered">0.403</td><td class="has-text-centered">0.344</td><td class="has-text-centered">0.334</td>
          <td class="has-text-centered">0.874</td><td class="has-text-centered">0.890</td><td class="has-text-centered">0.874</td><td class="has-text-centered">0.879</td>
          <td class="has-text-centered">0.005</td><td class="has-text-centered">0.009</td><td class="has-text-centered">0.007</td><td class="has-text-centered">0.003</td>
          <td class="has-text-centered">0.456</td><td class="has-text-centered">0.405</td><td class="has-text-centered">0.405</td><td class="has-text-centered">0.416</td>
        </tr>
        <tr>
          <td>Gemma 2 27B</td>
          <td class="has-text-centered">0.772</td><td class="has-text-centered">0.754</td><td class="has-text-centered">0.761</td><td class="has-text-centered performance-medium">0.923</td>
          <td class="has-text-centered">0.352</td><td class="has-text-centered">0.437</td><td class="has-text-centered">0.352</td><td class="has-text-centered">0.356</td>
          <td class="has-text-centered">0.897</td><td class="has-text-centered">0.914</td><td class="has-text-centered">0.897</td><td class="has-text-centered">0.902</td>
          <td class="has-text-centered">0.005</td><td class="has-text-centered">0.008</td><td class="has-text-centered">0.006</td><td class="has-text-centered">0.003</td>
          <td class="has-text-centered">0.320</td><td class="has-text-centered">0.295</td><td class="has-text-centered">0.295</td><td class="has-text-centered">0.298</td>
        </tr>
        <tr>
          <td>Gemma 2 9B</td>
          <td class="has-text-centered">0.665</td><td class="has-text-centered">0.643</td><td class="has-text-centered">0.651</td><td class="has-text-centered">0.886</td>
          <td class="has-text-centered">0.336</td><td class="has-text-centered">0.373</td><td class="has-text-centered">0.336</td><td class="has-text-centered">0.331</td>
          <td class="has-text-centered">0.885</td><td class="has-text-centered">0.902</td><td class="has-text-centered">0.885</td><td class="has-text-centered">0.892</td>
          <td class="has-text-centered">0.004</td><td class="has-text-centered">0.008</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.003</td>
          <td class="has-text-centered">0.348</td><td class="has-text-centered">0.419</td><td class="has-text-centered">0.419</td><td class="has-text-centered">0.367</td>
        </tr>
        <tr>
          <td>Mistral (7B) Instruct v0.3</td>
          <td class="has-text-centered">0.540</td><td class="has-text-centered">0.522</td><td class="has-text-centered">0.526</td><td class="has-text-centered">0.806</td>
          <td class="has-text-centered">0.278</td><td class="has-text-centered">0.383</td><td class="has-text-centered">0.278</td><td class="has-text-centered">0.276</td>
          <td class="has-text-centered">0.767</td><td class="has-text-centered">0.817</td><td class="has-text-centered">0.767</td><td class="has-text-centered">0.771</td>
          <td class="has-text-centered">0.004</td><td class="has-text-centered">0.006</td><td class="has-text-centered">0.004</td><td class="has-text-centered">0.002</td>
          <td class="has-text-centered">0.337</td><td class="has-text-centered">0.477</td><td class="has-text-centered">0.477</td><td class="has-text-centered">0.368</td>
        </tr>
        <tr>
          <td>Mixtral-8x22B Instruct</td>
          <td class="has-text-centered">0.653</td><td class="has-text-centered">0.625</td><td class="has-text-centered">0.635</td><td class="has-text-centered">0.870</td>
          <td class="has-text-centered">0.381</td><td class="has-text-centered">0.414</td><td class="has-text-centered">0.381</td><td class="has-text-centered">0.367</td>
          <td class="has-text-centered">0.807</td><td class="has-text-centered">0.847</td><td class="has-text-centered">0.807</td><td class="has-text-centered">0.811</td>
          <td class="has-text-centered">0.010</td><td class="has-text-centered">0.008</td><td class="has-text-centered">0.009</td><td class="has-text-centered">0.005</td>
          <td class="has-text-centered">0.428</td><td class="has-text-centered">0.481</td><td class="has-text-centered">0.481</td><td class="has-text-centered">0.435</td>
        </tr>
        <tr>
          <td>Mixtral-8x7B Instruct</td>
          <td class="has-text-centered">0.613</td><td class="has-text-centered">0.591</td><td class="has-text-centered">0.598</td><td class="has-text-centered">0.875</td>
          <td class="has-text-centered">0.291</td><td class="has-text-centered">0.376</td><td class="has-text-centered">0.291</td><td class="has-text-centered">0.282</td>
          <td class="has-text-centered">0.840</td><td class="has-text-centered">0.863</td><td class="has-text-centered">0.840</td><td class="has-text-centered">0.845</td>
          <td class="has-text-centered">0.007</td><td class="has-text-centered">0.012</td><td class="has-text-centered">0.009</td><td class="has-text-centered">0.005</td>
          <td class="has-text-centered">0.251</td><td class="has-text-centered">0.324</td><td class="has-text-centered">0.324</td><td class="has-text-centered">0.267</td>
        </tr>
        <tr>
          <td>Qwen 2 Instruct (72B)</td>
          <td class="has-text-centered">0.766</td><td class="has-text-centered">0.742</td><td class="has-text-centered">0.748</td><td class="has-text-centered">0.899</td>
          <td class="has-text-centered">0.365</td><td class="has-text-centered">0.407</td><td class="has-text-centered">0.365</td><td class="has-text-centered">0.348</td>
          <td class="has-text-centered">0.850</td><td class="has-text-centered">0.881</td><td class="has-text-centered">0.850</td><td class="has-text-centered">0.854</td>
          <td class="has-text-centered">0.010</td><td class="has-text-centered">0.016</td><td class="has-text-centered">0.012</td><td class="has-text-centered">0.006</td>
          <td class="has-text-centered">0.468</td><td class="has-text-centered">0.530</td><td class="has-text-centered">0.530</td><td class="has-text-centered">0.483</td>
        </tr>
        <tr>
          <td>WizardLM-2 8x22B</td>
          <td class="has-text-centered">0.755</td><td class="has-text-centered">0.741</td><td class="has-text-centered">0.744</td><td class="has-text-centered">0.920</td>
          <td class="has-text-centered">0.362</td><td class="has-text-centered">0.397</td><td class="has-text-centered">0.362</td><td class="has-text-centered">0.355</td>
          <td class="has-text-centered">0.846</td><td class="has-text-centered">0.874</td><td class="has-text-centered">0.846</td><td class="has-text-centered">0.852</td>
          <td class="has-text-centered">0.008</td><td class="has-text-centered">0.009</td><td class="has-text-centered">0.008</td><td class="has-text-centered">0.004</td>
          <td class="has-text-centered">0.222</td><td class="has-text-centered">0.247</td><td class="has-text-centered">0.247</td><td class="has-text-centered">0.226</td>
        </tr>
        <tr>
          <td>DeepSeek-V3</td>
          <td class="has-text-centered performance-medium">0.798</td><td class="has-text-centered performance-medium">0.787</td><td class="has-text-centered performance-medium">0.790</td><td class="has-text-centered performance-best">0.945</td>
          <td class="has-text-centered performance-strong">0.450</td><td class="has-text-centered performance-strong">0.463</td><td class="has-text-centered performance-strong">0.450</td><td class="has-text-centered performance-strong">0.437</td>
          <td class="has-text-centered">0.927</td><td class="has-text-centered performance-medium">0.943</td><td class="has-text-centered">0.927</td><td class="has-text-centered">0.934</td>
          <td class="has-text-centered performance-strong">0.034</td><td class="has-text-centered performance-medium">0.067</td><td class="has-text-centered performance-medium">0.045</td><td class="has-text-centered performance-medium">0.023</td>
          <td class="has-text-centered">0.563</td><td class="has-text-centered">0.544</td><td class="has-text-centered">0.544</td><td class="has-text-centered">0.549</td>
        </tr>
        <tr>
          <td>DeepSeek R1</td>
          <td class="has-text-centered performance-best">0.813</td><td class="has-text-centered performance-best">0.805</td><td class="has-text-centered performance-best">0.807</td><td class="has-text-centered performance-strong">0.944</td>
          <td class="has-text-centered performance-medium">0.412</td><td class="has-text-centered">0.424</td><td class="has-text-centered performance-medium">0.412</td><td class="has-text-centered">0.393</td>
          <td class="has-text-centered performance-best">0.946</td><td class="has-text-centered performance-best">0.960</td><td class="has-text-centered performance-best">0.946</td><td class="has-text-centered performance-best">0.952</td>
          <td class="has-text-centered performance-best">0.044</td><td class="has-text-centered performance-best">0.082</td><td class="has-text-centered performance-best">0.057</td><td class="has-text-centered performance-best">0.029</td>
          <td class="has-text-centered performance-medium">0.600</td><td class="has-text-centered performance-medium">0.586</td><td class="has-text-centered performance-medium">0.586</td><td class="has-text-centered performance-medium">0.587</td>
        </tr>
        <tr>
          <td>QwQ-32B-Preview</td>
          <td class="has-text-centered">0.695</td><td class="has-text-centered">0.681</td><td class="has-text-centered">0.685</td><td class="has-text-centered">0.907</td>
          <td class="has-text-centered">0.278</td><td class="has-text-centered">0.396</td><td class="has-text-centered">0.278</td><td class="has-text-centered">0.270</td>
          <td class="has-text-centered">0.680</td><td class="has-text-centered">0.770</td><td class="has-text-centered">0.680</td><td class="has-text-centered">0.656</td>
          <td class="has-text-centered">0.001</td><td class="has-text-centered">0.001</td><td class="has-text-centered">0.001</td><td class="has-text-centered">0.000</td>
          <td class="has-text-centered">0.005</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.005</td>
        </tr>
        <tr>
          <td>Jamba 1.5 Mini</td>
          <td class="has-text-centered">0.564</td><td class="has-text-centered">0.556</td><td class="has-text-centered">0.552</td><td class="has-text-centered">0.818</td>
          <td class="has-text-centered">0.308</td><td class="has-text-centered">0.450</td><td class="has-text-centered">0.308</td><td class="has-text-centered">0.284</td>
          <td class="has-text-centered">0.830</td><td class="has-text-centered">0.864</td><td class="has-text-centered">0.830</td><td class="has-text-centered">0.844</td>
          <td class="has-text-centered">0.004</td><td class="has-text-centered">0.006</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.003</td>
          <td class="has-text-centered">0.119</td><td class="has-text-centered">0.182</td><td class="has-text-centered">0.182</td><td class="has-text-centered">0.132</td>
        </tr>
        <tr>
          <td>Jamba 1.5 Large</td>
          <td class="has-text-centered">0.707</td><td class="has-text-centered">0.687</td><td class="has-text-centered">0.693</td><td class="has-text-centered">0.883</td>
          <td class="has-text-centered">0.341</td><td class="has-text-centered">0.452</td><td class="has-text-centered">0.341</td><td class="has-text-centered">0.341</td>
          <td class="has-text-centered">0.856</td><td class="has-text-centered">0.890</td><td class="has-text-centered">0.856</td><td class="has-text-centered">0.862</td>
          <td class="has-text-centered">0.004</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.005</td><td class="has-text-centered">0.002</td>
          <td class="has-text-centered">0.403</td><td class="has-text-centered">0.414</td><td class="has-text-centered">0.414</td><td class="has-text-centered">0.397</td>
        </tr>
        <tr>
          <td>Claude 3.5 Sonnet</td>
          <td class="has-text-centered performance-strong">0.811</td><td class="has-text-centered performance-strong">0.794</td><td class="has-text-centered performance-strong">0.799</td><td class="has-text-centered">0.922</td>
          <td class="has-text-centered performance-best">0.455</td><td class="has-text-centered performance-best">0.465</td><td class="has-text-centered performance-best">0.455</td><td class="has-text-centered performance-best">0.439</td>
          <td class="has-text-centered">0.873</td><td class="has-text-centered">0.927</td><td class="has-text-centered">0.873</td><td class="has-text-centered">0.891</td>
          <td class="has-text-centered performance-strong">0.034</td><td class="has-text-centered performance-strong">0.080</td><td class="has-text-centered performance-strong">0.047</td><td class="has-text-centered performance-strong">0.024</td>
          <td class="has-text-centered performance-strong">0.658</td><td class="has-text-centered performance-strong">0.668</td><td class="has-text-centered performance-strong">0.668</td><td class="has-text-centered performance-strong">0.655</td>
        </tr>
        <tr>
          <td>Claude 3 Haiku</td>
          <td class="has-text-centered">0.732</td><td class="has-text-centered">0.700</td><td class="has-text-centered">0.711</td><td class="has-text-centered">0.895</td>
          <td class="has-text-centered">0.294</td><td class="has-text-centered">0.330</td><td class="has-text-centered">0.294</td><td class="has-text-centered">0.285</td>
          <td class="has-text-centered">0.879</td><td class="has-text-centered">0.917</td><td class="has-text-centered">0.879</td><td class="has-text-centered">0.883</td>
          <td class="has-text-centered">0.011</td><td class="has-text-centered">0.022</td><td class="has-text-centered">0.015</td><td class="has-text-centered">0.008</td>
          <td class="has-text-centered">0.498</td><td class="has-text-centered">0.517</td><td class="has-text-centered">0.517</td><td class="has-text-centered">0.494</td>
        </tr>
        <tr>
          <td>Cohere Command R +</td>
          <td class="has-text-centered">0.769</td><td class="has-text-centered">0.750</td><td class="has-text-centered">0.756</td><td class="has-text-centered">0.902</td>
          <td class="has-text-centered">0.353</td><td class="has-text-centered">0.405</td><td class="has-text-centered">0.353</td><td class="has-text-centered">0.333</td>
          <td class="has-text-centered">0.917</td><td class="has-text-centered">0.930</td><td class="has-text-centered">0.917</td><td class="has-text-centered">0.922</td>
          <td class="has-text-centered">0.016</td><td class="has-text-centered">0.032</td><td class="has-text-centered">0.021</td><td class="has-text-centered">0.011</td>
          <td class="has-text-centered">0.462</td><td class="has-text-centered">0.459</td><td class="has-text-centered">0.459</td><td class="has-text-centered">0.452</td>
        </tr>
        <tr>
          <td>Google Gemini 1.5 Pro</td>
          <td class="has-text-centered">0.728</td><td class="has-text-centered">0.705</td><td class="has-text-centered">0.712</td><td class="has-text-centered">0.891</td>
          <td class="has-text-centered">0.373</td><td class="has-text-centered">0.436</td><td class="has-text-centered">0.373</td><td class="has-text-centered">0.374</td>
          <td class="has-text-centered performance-strong">0.934</td><td class="has-text-centered performance-strong">0.955</td><td class="has-text-centered performance-strong">0.934</td><td class="has-text-centered performance-strong">0.944</td>
          <td class="has-text-centered">0.014</td><td class="has-text-centered">0.028</td><td class="has-text-centered">0.019</td><td class="has-text-centered">0.010</td>
          <td class="has-text-centered">0.399</td><td class="has-text-centered">0.400</td><td class="has-text-centered">0.400</td><td class="has-text-centered">0.393</td>
        </tr>
        <tr>
          <td>OpenAI gpt-4o</td>
          <td class="has-text-centered">0.778</td><td class="has-text-centered">0.760</td><td class="has-text-centered">0.766</td><td class="has-text-centered">0.911</td>
          <td class="has-text-centered">0.402</td><td class="has-text-centered">0.445</td><td class="has-text-centered">0.402</td><td class="has-text-centered">0.399</td>
          <td class="has-text-centered performance-medium">0.931</td><td class="has-text-centered performance-strong">0.955</td><td class="has-text-centered performance-medium">0.931</td><td class="has-text-centered performance-medium">0.942</td>
          <td class="has-text-centered performance-medium">0.027</td><td class="has-text-centered">0.056</td><td class="has-text-centered">0.037</td><td class="has-text-centered">0.019</td>
          <td class="has-text-centered">0.537</td><td class="has-text-centered">0.517</td><td class="has-text-centered">0.517</td><td class="has-text-centered">0.523</td>
        </tr>
        <tr>
          <td>OpenAI o1-mini</td>
          <td class="has-text-centered">0.772</td><td class="has-text-centered">0.755</td><td class="has-text-centered">0.761</td><td class="has-text-centered">0.922</td>
          <td class="has-text-centered">0.407</td><td class="has-text-centered">0.444</td><td class="has-text-centered">0.407</td><td class="has-text-centered performance-medium">0.403</td>
          <td class="has-text-centered">0.867</td><td class="has-text-centered">0.900</td><td class="has-text-centered">0.867</td><td class="has-text-centered">0.876</td>
          <td class="has-text-centered">0.007</td><td class="has-text-centered">0.015</td><td class="has-text-centered">0.010</td><td class="has-text-centered">0.005</td>
          <td class="has-text-centered performance-best">0.661</td><td class="has-text-centered performance-best">0.681</td><td class="has-text-centered performance-best">0.681</td><td class="has-text-centered performance-best">0.662</td>
        </tr>
      </tbody>
    </table>
    <div class="content is-small mt-4">
      <p><strong>Note:</strong> Color highlighting indicates performance ranking:
        <span class="performance-best"> Best </span>,
        <span class="performance-strong"> Strong </span>,
        <span class="performance-medium"> Good </span>
      </p>
    </div>
  </div>
</div>
manual_integration.md
ADDED
@@ -0,0 +1,36 @@
# Manual Integration Instructions

Since we're having issues with shell scripts due to line ending compatibility, here's how to manually integrate the tables:
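If you would rather repair the scripts than work around them, the usual culprit on WSL is Windows-style CRLF line endings. A minimal Python sketch (the script name here is hypothetical; substitute the real one):

```python
# Normalize CRLF -> LF so a shell script saved on Windows runs under WSL.
# "integrate_tables.sh" is an assumed name, not a file from this repo.
from pathlib import Path

script = Path("integrate_tables.sh")
script.write_bytes(script.read_bytes().replace(b"\r\n", b"\n"))
```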
## 1. First, create a backup of your original results.html file
```bash
cp /mnt/c/Users/mikad/Documents/GitHub/Evals/results.html /mnt/c/Users/mikad/Documents/GitHub/Evals/results.html.backup
```

## 2. Integrating Text Classification Table
1. Open the results.html file in a text editor
2. Find the section `<div id="text-classification" class="tab-content">`
3. Replace everything from this line until the next `</div>` that closes this section with the content of text_classification_table.html

## 3. Integrating Text Summarization Table
1. Find the section `<div id="text-summarization" class="tab-content">`
2. Replace everything from this line until the next `</div>` that closes this section with the content of text_summarization_table.html

## 4. Adding Sentiment Analysis Table
1. Find the section `<div id="question-answering" class="tab-content">`
2. Find the closing `</div>` for this section (should be after the question answering table)
3. Insert the content of sentiment_analysis_table.html right after this closing div

## 5. Viewing the Results
After making these changes, save the file and open it in a web browser to verify all tables are displaying correctly.

## Alternative: Manual Copy/Paste Method
If you have access to a graphical text editor (like VS Code, Notepad++, etc.):

1. Open all files in the editor:
   - results.html
   - text_classification_table.html
   - text_summarization_table.html
   - sentiment_analysis_table.html

2. Copy the content from each table file and paste it into the appropriate section of results.html, replacing the placeholder content or adding it after the question-answering section. (A scripted version of these steps is sketched below.)
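As a scripted alternative to the manual steps above, here is a minimal sketch using only the Python standard library. It assumes each section in results.html opens with `<div id="..." class="tab-content">`, that the section's `<div>` tags are balanced, and that the table file names match those listed in this commit:

```python
# Sketch: automate steps 2-4 above. Assumes balanced <div> tags and the
# file layout shown in this commit; paths are relative to the repo root.
import re
from pathlib import Path

def section_span(html: str, section_id: str) -> tuple[int, int]:
    """Return (start, end) offsets of the <div id=section_id ...> block."""
    start = html.index(f'<div id="{section_id}"')
    depth = 0
    for m in re.finditer(r"<div\b|</div>", html[start:]):
        depth += 1 if m.group().startswith("<div") else -1
        if depth == 0:
            return start, start + m.end()
    raise ValueError(f"unbalanced <div> tags in section #{section_id}")

results = Path("results.html")
html = results.read_text(encoding="utf-8")

# Steps 2 and 3: replace each placeholder section with its table file.
for section_id, table_file in [
    ("text-classification", "text_classification_table.html"),
    ("text-summarization", "text_summarization_table.html"),
]:
    start, end = section_span(html, section_id)
    html = html[:start] + Path(table_file).read_text(encoding="utf-8") + html[end:]

# Step 4: insert the sentiment table right after the question-answering section.
_, qa_end = section_span(html, "question-answering")
sentiment = Path("sentiment_analysis_table.html").read_text(encoding="utf-8")
html = html[:qa_end] + "\n" + sentiment + html[qa_end:]

results.write_text(html, encoding="utf-8")
```

Back up results.html first (step 1), since the script rewrites it in place.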
qa_table.html
ADDED
@@ -0,0 +1,166 @@
<!-- Question Answering -->
<div id="question-answering" class="tab-content">
<h2 class="title is-4">Question Answering Task Results</h2>
<div class="results-table">
<table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
<thead>
<tr>
<th rowspan="2">Model</th>
<th colspan="3" class="has-text-centered">Datasets (Accuracy)</th>
</tr>
<tr>
<th class="has-text-centered tooltip-trigger" data-tooltip="Large-scale dataset for numerical reasoning over financial data, consisting of 8,281 question-answer pairs from financial reports. Focuses on questions requiring interpretation of financial data and multi-step reasoning. Licensed under CC BY-NC 4.0.">FinQA</th>
<th class="has-text-centered tooltip-trigger" data-tooltip="Multi-turn question answering dataset with 3,892 conversations and 14,115 questions exploring chains of numerical reasoning in financial dialogues. Released under MIT License.">ConvFinQA</th>
<th class="has-text-centered tooltip-trigger" data-tooltip="Large-scale QA dataset for hybrid data sources (tables and text) from financial reports, emphasizing numerical reasoning operations. Licensed under CC BY 4.0.">TATQA</th>
</tr>
</thead>
<tbody>
<tr>
<td>Llama 3 70B Instruct</td>
<td class="has-text-centered">0.809</td>
<td class="has-text-centered">0.709</td>
<td class="has-text-centered">0.772</td>
</tr>
<tr>
<td>Llama 3 8B Instruct</td>
<td class="has-text-centered">0.767</td>
<td class="has-text-centered">0.268</td>
<td class="has-text-centered">0.706</td>
</tr>
<tr>
<td>DBRX Instruct</td>
<td class="has-text-centered">0.738</td>
<td class="has-text-centered">0.252</td>
<td class="has-text-centered">0.633</td>
</tr>
<tr>
<td>DeepSeek LLM (67B)</td>
<td class="has-text-centered">0.742</td>
<td class="has-text-centered">0.174</td>
<td class="has-text-centered">0.355</td>
</tr>
<tr>
<td>Gemma 2 27B</td>
<td class="has-text-centered">0.768</td>
<td class="has-text-centered">0.268</td>
<td class="has-text-centered">0.734</td>
</tr>
<tr>
<td>Gemma 2 9B</td>
<td class="has-text-centered">0.779</td>
<td class="has-text-centered">0.292</td>
<td class="has-text-centered">0.750</td>
</tr>
<tr>
<td>Mistral (7B) Instruct v0.3</td>
<td class="has-text-centered">0.655</td>
<td class="has-text-centered">0.199</td>
<td class="has-text-centered">0.553</td>
</tr>
<tr>
<td>Mixtral-8x22B Instruct</td>
<td class="has-text-centered">0.766</td>
<td class="has-text-centered">0.285</td>
<td class="has-text-centered">0.666</td>
</tr>
<tr>
<td>Mixtral-8x7B Instruct</td>
<td class="has-text-centered">0.611</td>
<td class="has-text-centered">0.315</td>
<td class="has-text-centered">0.501</td>
</tr>
<tr>
<td>Qwen 2 Instruct (72B)</td>
<td class="has-text-centered">0.819</td>
<td class="has-text-centered">0.269</td>
<td class="has-text-centered">0.715</td>
</tr>
<tr>
<td>WizardLM-2 8x22B</td>
<td class="has-text-centered">0.796</td>
<td class="has-text-centered">0.247</td>
<td class="has-text-centered">0.725</td>
</tr>
<tr>
<td>DeepSeek-V3</td>
<td class="has-text-centered performance-medium">0.840</td>
<td class="has-text-centered">0.261</td>
<td class="has-text-centered performance-low">0.779</td>
</tr>
<tr>
<td>DeepSeek R1</td>
<td class="has-text-centered performance-low">0.836</td>
<td class="has-text-centered performance-best">0.853</td>
<td class="has-text-centered performance-best">0.858</td>
</tr>
<tr>
<td>QwQ-32B-Preview</td>
<td class="has-text-centered">0.793</td>
<td class="has-text-centered">0.282</td>
<td class="has-text-centered performance-medium">0.796</td>
</tr>
<tr>
<td>Jamba 1.5 Mini</td>
<td class="has-text-centered">0.666</td>
<td class="has-text-centered">0.218</td>
<td class="has-text-centered">0.586</td>
</tr>
<tr>
<td>Jamba 1.5 Large</td>
<td class="has-text-centered">0.790</td>
<td class="has-text-centered">0.225</td>
<td class="has-text-centered">0.660</td>
</tr>
<tr>
<td>Claude 3.5 Sonnet</td>
<td class="has-text-centered performance-best">0.844</td>
<td class="has-text-centered">0.402</td>
<td class="has-text-centered">0.700</td>
</tr>
<tr>
<td>Claude 3 Haiku</td>
<td class="has-text-centered">0.803</td>
<td class="has-text-centered">0.421</td>
<td class="has-text-centered">0.733</td>
</tr>
<tr>
<td>Cohere Command R 7B</td>
<td class="has-text-centered">0.709</td>
<td class="has-text-centered">0.212</td>
<td class="has-text-centered">0.716</td>
</tr>
<tr>
<td>Cohere Command R +</td>
<td class="has-text-centered">0.776</td>
<td class="has-text-centered">0.259</td>
<td class="has-text-centered">0.698</td>
</tr>
<tr>
<td>Google Gemini 1.5 Pro</td>
<td class="has-text-centered">0.829</td>
<td class="has-text-centered">0.280</td>
<td class="has-text-centered">0.763</td>
</tr>
<tr>
<td>OpenAI gpt-4o</td>
<td class="has-text-centered performance-low">0.836</td>
<td class="has-text-centered performance-low">0.749</td>
<td class="has-text-centered">0.754</td>
</tr>
<tr>
<td>OpenAI o1-mini</td>
<td class="has-text-centered">0.799</td>
<td class="has-text-centered performance-medium">0.840</td>
<td class="has-text-centered">0.698</td>
</tr>
</tbody>
</table>
<div class="content is-small mt-4">
<p><strong>Note:</strong> Color highlighting indicates performance ranking:
<span class="performance-best"> Best </span>,
<span class="performance-medium"> Strong </span>,
<span class="performance-low"> Good </span>
</p>
</div>
</div>
</div>
results.html
ADDED
The diff for this file is too large to render.
See raw diff
sentiment_analysis_table.html
ADDED
@@ -0,0 +1,360 @@
<!-- Sentiment Analysis -->
<div id="sentiment-analysis" class="tab-content">
<h2 class="title is-4">Sentiment Analysis Task Results</h2>
<div class="results-table">
<table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
<thead>
<tr>
<th rowspan="2">Model</th>
<th colspan="3" class="has-text-centered tooltip-trigger" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis. Given a financial text, such as microblog posts or news headlines, systems predict sentiment scores on a continuous scale from -1 (negative) to 1 (positive). Evaluation metrics include MSE, MAE, and R-squared.">FiQA Task 1</th>
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="Financial Phrase Bank contains 4,840 sentences from English-language financial news articles, categorized as positive, negative, or neutral. Each sentence reflects the sentiment an investor might perceive regarding its influence on stock prices. Annotated by 16 finance experts using majority voting.">Financial Phrase Bank (FPB)</th>
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="Manually-annotated dataset focusing on subjectivity in Earnings Call Transcripts QA sessions. Includes 49,446 annotations across 2,747 QA pairs labeled on six subjectivity features: Assertive, Cautious, Optimistic, Specific, Clear, and Relevant.">SubjECTive-QA</th>
</tr>
<tr>
<th class="has-text-centered">MSE</th>
<th class="has-text-centered">MAE</th>
<th class="has-text-centered">r² Score</th>
<th class="has-text-centered">Accuracy</th>
<th class="has-text-centered">Precision</th>
<th class="has-text-centered">Recall</th>
<th class="has-text-centered">F1</th>
<th class="has-text-centered">Precision</th>
<th class="has-text-centered">Recall</th>
<th class="has-text-centered">F1</th>
<th class="has-text-centered">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td>Llama 3 70B Instruct</td>
<td class="has-text-centered">0.123</td>
<td class="has-text-centered">0.290</td>
<td class="has-text-centered">0.272</td>
<td class="has-text-centered">0.901</td>
<td class="has-text-centered">0.904</td>
<td class="has-text-centered">0.901</td>
<td class="has-text-centered">0.902</td>
<td class="has-text-centered">0.652</td>
<td class="has-text-centered">0.573</td>
<td class="has-text-centered">0.535</td>
<td class="has-text-centered">0.573</td>
</tr>
<tr>
<td>Llama 3 8B Instruct</td>
<td class="has-text-centered">0.161</td>
<td class="has-text-centered">0.344</td>
<td class="has-text-centered">0.045</td>
<td class="has-text-centered">0.738</td>
<td class="has-text-centered">0.801</td>
<td class="has-text-centered">0.738</td>
<td class="has-text-centered">0.698</td>
<td class="has-text-centered">0.635</td>
<td class="has-text-centered">0.625</td>
<td class="has-text-centered performance-best">0.600</td>
<td class="has-text-centered">0.625</td>
</tr>
<tr>
<td>DBRX Instruct</td>
<td class="has-text-centered">0.160</td>
<td class="has-text-centered">0.321</td>
<td class="has-text-centered">0.052</td>
<td class="has-text-centered">0.524</td>
<td class="has-text-centered">0.727</td>
<td class="has-text-centered">0.524</td>
<td class="has-text-centered">0.499</td>
<td class="has-text-centered">0.654</td>
<td class="has-text-centered">0.541</td>
<td class="has-text-centered">0.436</td>
<td class="has-text-centered">0.541</td>
</tr>
<tr>
<td>DeepSeek LLM (67B)</td>
<td class="has-text-centered">0.118</td>
<td class="has-text-centered">0.278</td>
<td class="has-text-centered">0.302</td>
<td class="has-text-centered">0.815</td>
<td class="has-text-centered">0.867</td>
<td class="has-text-centered">0.815</td>
<td class="has-text-centered">0.811</td>
<td class="has-text-centered">0.676</td>
<td class="has-text-centered">0.544</td>
<td class="has-text-centered">0.462</td>
<td class="has-text-centered">0.544</td>
</tr>
<tr>
<td>Gemma 2 27B</td>
<td class="has-text-centered performance-best">0.100</td>
<td class="has-text-centered performance-best">0.266</td>
<td class="has-text-centered">0.406</td>
<td class="has-text-centered">0.890</td>
<td class="has-text-centered">0.896</td>
<td class="has-text-centered">0.890</td>
<td class="has-text-centered">0.884</td>
<td class="has-text-centered">0.562</td>
<td class="has-text-centered">0.524</td>
<td class="has-text-centered">0.515</td>
<td class="has-text-centered">0.524</td>
</tr>
<tr>
<td>Gemma 2 9B</td>
<td class="has-text-centered">0.189</td>
<td class="has-text-centered">0.352</td>
<td class="has-text-centered">-0.120</td>
<td class="has-text-centered performance-strong">0.940</td>
<td class="has-text-centered performance-strong">0.941</td>
<td class="has-text-centered performance-strong">0.940</td>
<td class="has-text-centered performance-strong">0.940</td>
<td class="has-text-centered">0.570</td>
<td class="has-text-centered">0.499</td>
<td class="has-text-centered">0.491</td>
<td class="has-text-centered">0.499</td>
</tr>
<tr>
<td>Mistral (7B) Instruct v0.3</td>
<td class="has-text-centered">0.135</td>
<td class="has-text-centered">0.278</td>
<td class="has-text-centered">0.200</td>
<td class="has-text-centered">0.847</td>
<td class="has-text-centered">0.854</td>
<td class="has-text-centered">0.847</td>
<td class="has-text-centered">0.841</td>
<td class="has-text-centered">0.607</td>
<td class="has-text-centered">0.542</td>
<td class="has-text-centered">0.522</td>
<td class="has-text-centered">0.542</td>
</tr>
<tr>
<td>Mixtral-8x22B Instruct</td>
<td class="has-text-centered">0.221</td>
<td class="has-text-centered">0.364</td>
<td class="has-text-centered">-0.310</td>
<td class="has-text-centered">0.768</td>
<td class="has-text-centered">0.845</td>
<td class="has-text-centered">0.768</td>
<td class="has-text-centered">0.776</td>
<td class="has-text-centered">0.614</td>
<td class="has-text-centered">0.538</td>
<td class="has-text-centered">0.510</td>
<td class="has-text-centered">0.538</td>
</tr>
<tr>
<td>Mixtral-8x7B Instruct</td>
<td class="has-text-centered">0.208</td>
<td class="has-text-centered">0.307</td>
<td class="has-text-centered">-0.229</td>
<td class="has-text-centered">0.896</td>
<td class="has-text-centered">0.898</td>
<td class="has-text-centered">0.896</td>
<td class="has-text-centered">0.893</td>
<td class="has-text-centered">0.611</td>
<td class="has-text-centered">0.518</td>
<td class="has-text-centered">0.498</td>
<td class="has-text-centered">0.518</td>
</tr>
<tr>
<td>Qwen 2 Instruct (72B)</td>
<td class="has-text-centered">0.205</td>
<td class="has-text-centered">0.409</td>
<td class="has-text-centered">-0.212</td>
<td class="has-text-centered">0.904</td>
<td class="has-text-centered">0.908</td>
<td class="has-text-centered">0.904</td>
<td class="has-text-centered">0.901</td>
<td class="has-text-centered">0.644</td>
<td class="has-text-centered">0.601</td>
<td class="has-text-centered">0.576</td>
<td class="has-text-centered">0.601</td>
</tr>
<tr>
<td>WizardLM-2 8x22B</td>
<td class="has-text-centered">0.129</td>
<td class="has-text-centered">0.283</td>
<td class="has-text-centered">0.239</td>
<td class="has-text-centered">0.765</td>
<td class="has-text-centered">0.853</td>
<td class="has-text-centered">0.765</td>
<td class="has-text-centered">0.779</td>
<td class="has-text-centered">0.611</td>
<td class="has-text-centered">0.570</td>
<td class="has-text-centered">0.566</td>
<td class="has-text-centered">0.570</td>
</tr>
<tr>
<td>DeepSeek-V3</td>
<td class="has-text-centered">0.150</td>
<td class="has-text-centered">0.311</td>
<td class="has-text-centered">0.111</td>
<td class="has-text-centered">0.828</td>
<td class="has-text-centered">0.851</td>
<td class="has-text-centered">0.828</td>
<td class="has-text-centered">0.814</td>
<td class="has-text-centered">0.640</td>
<td class="has-text-centered">0.572</td>
<td class="has-text-centered performance-medium">0.583</td>
<td class="has-text-centered">0.572</td>
</tr>
<tr>
<td>DeepSeek R1</td>
<td class="has-text-centered performance-low">0.110</td>
<td class="has-text-centered">0.289</td>
<td class="has-text-centered">0.348</td>
<td class="has-text-centered">0.904</td>
<td class="has-text-centered">0.907</td>
<td class="has-text-centered">0.904</td>
<td class="has-text-centered">0.902</td>
<td class="has-text-centered">0.644</td>
<td class="has-text-centered">0.489</td>
<td class="has-text-centered">0.499</td>
<td class="has-text-centered">0.489</td>
</tr>
<tr>
<td>QwQ-32B-Preview</td>
<td class="has-text-centered">0.141</td>
<td class="has-text-centered">0.290</td>
<td class="has-text-centered">0.165</td>
<td class="has-text-centered">0.812</td>
<td class="has-text-centered">0.827</td>
<td class="has-text-centered">0.812</td>
<td class="has-text-centered">0.815</td>
<td class="has-text-centered">0.629</td>
<td class="has-text-centered">0.534</td>
<td class="has-text-centered">0.550</td>
<td class="has-text-centered">0.534</td>
</tr>
<tr>
<td>Jamba 1.5 Mini</td>
<td class="has-text-centered performance-low">0.119</td>
<td class="has-text-centered">0.282</td>
<td class="has-text-centered">0.293</td>
<td class="has-text-centered">0.784</td>
<td class="has-text-centered">0.814</td>
<td class="has-text-centered">0.784</td>
<td class="has-text-centered">0.765</td>
<td class="has-text-centered">0.380</td>
<td class="has-text-centered">0.525</td>
<td class="has-text-centered">0.418</td>
<td class="has-text-centered">0.525</td>
</tr>
<tr>
<td>Jamba 1.5 Large</td>
<td class="has-text-centered">0.183</td>
<td class="has-text-centered">0.363</td>
<td class="has-text-centered">-0.085</td>
<td class="has-text-centered">0.824</td>
<td class="has-text-centered">0.850</td>
<td class="has-text-centered">0.824</td>
<td class="has-text-centered">0.798</td>
<td class="has-text-centered">0.635</td>
<td class="has-text-centered">0.573</td>
<td class="has-text-centered performance-medium">0.582</td>
<td class="has-text-centered">0.573</td>
</tr>
<tr>
<td>Claude 3.5 Sonnet</td>
<td class="has-text-centered performance-low">0.101</td>
<td class="has-text-centered performance-low">0.268</td>
<td class="has-text-centered performance-best">0.402</td>
<td class="has-text-centered performance-best">0.944</td>
<td class="has-text-centered performance-best">0.945</td>
<td class="has-text-centered performance-best">0.944</td>
<td class="has-text-centered performance-best">0.944</td>
<td class="has-text-centered">0.634</td>
<td class="has-text-centered performance-medium">0.585</td>
<td class="has-text-centered">0.553</td>
<td class="has-text-centered performance-medium">0.585</td>
</tr>
<tr>
<td>Claude 3 Haiku</td>
<td class="has-text-centered">0.167</td>
<td class="has-text-centered">0.349</td>
<td class="has-text-centered">0.008</td>
<td class="has-text-centered">0.907</td>
<td class="has-text-centered">0.913</td>
<td class="has-text-centered">0.907</td>
<td class="has-text-centered">0.908</td>
<td class="has-text-centered">0.619</td>
<td class="has-text-centered">0.538</td>
<td class="has-text-centered">0.463</td>
<td class="has-text-centered">0.538</td>
</tr>
<tr>
<td>Cohere Command R 7B</td>
<td class="has-text-centered">0.164</td>
<td class="has-text-centered">0.319</td>
<td class="has-text-centered">0.028</td>
<td class="has-text-centered">0.835</td>
<td class="has-text-centered">0.861</td>
<td class="has-text-centered">0.835</td>
<td class="has-text-centered">0.840</td>
<td class="has-text-centered">0.609</td>
<td class="has-text-centered">0.547</td>
<td class="has-text-centered">0.532</td>
<td class="has-text-centered">0.547</td>
</tr>
<tr>
<td>Cohere Command R +</td>
<td class="has-text-centered performance-medium">0.106</td>
<td class="has-text-centered">0.274</td>
<td class="has-text-centered performance-medium">0.373</td>
<td class="has-text-centered">0.741</td>
<td class="has-text-centered">0.806</td>
<td class="has-text-centered">0.741</td>
<td class="has-text-centered">0.699</td>
<td class="has-text-centered">0.608</td>
<td class="has-text-centered">0.547</td>
<td class="has-text-centered">0.533</td>
<td class="has-text-centered">0.547</td>
</tr>
<tr>
<td>Google Gemini 1.5 Pro</td>
<td class="has-text-centered">0.144</td>
<td class="has-text-centered">0.329</td>
<td class="has-text-centered">0.149</td>
<td class="has-text-centered">0.890</td>
<td class="has-text-centered">0.895</td>
<td class="has-text-centered">0.890</td>
<td class="has-text-centered">0.885</td>
<td class="has-text-centered">0.642</td>
<td class="has-text-centered performance-medium">0.587</td>
<td class="has-text-centered performance-best">0.593</td>
<td class="has-text-centered performance-best">0.587</td>
</tr>
<tr>
<td>OpenAI gpt-4o</td>
<td class="has-text-centered">0.184</td>
<td class="has-text-centered">0.317</td>
<td class="has-text-centered">-0.089</td>
<td class="has-text-centered">0.929</td>
<td class="has-text-centered">0.931</td>
<td class="has-text-centered">0.929</td>
<td class="has-text-centered">0.928</td>
<td class="has-text-centered">0.639</td>
<td class="has-text-centered">0.515</td>
<td class="has-text-centered">0.541</td>
<td class="has-text-centered">0.515</td>
</tr>
<tr>
<td>OpenAI o1-mini</td>
<td class="has-text-centered performance-medium">0.120</td>
<td class="has-text-centered">0.295</td>
<td class="has-text-centered">0.289</td>
<td class="has-text-centered">0.918</td>
<td class="has-text-centered">0.917</td>
<td class="has-text-centered">0.918</td>
<td class="has-text-centered">0.917</td>
<td class="has-text-centered performance-best">0.660</td>
<td class="has-text-centered">0.515</td>
<td class="has-text-centered">0.542</td>
<td class="has-text-centered">0.515</td>
</tr>
</tbody>
</table>
<div class="content is-small mt-4">
<p><strong>Note:</strong> Color highlighting indicates performance ranking:
<span class="performance-best"> Best </span>,
<span class="performance-medium"> Strong </span>,
<span class="performance-low"> Good </span>
</p>
</div>
</div>
</div>
static/css/index.css
CHANGED
@@ -234,6 +234,7 @@ body {
.content ul li {
margin-bottom: 0.5rem;
position: relative;
+list-style-type: none;
padding-left: 1.5rem;
}
@@ -558,4 +559,54 @@ figcaption {
.card {
margin-bottom: 1rem;
}
+
+.tabs ul {
+flex-wrap: wrap;
+}
}
+
+/* Tabbed Results */
+.results-table-container {
+background-color: white;
+border-radius: 8px;
+box-shadow: 0 4px 10px rgba(0,0,0,0.1);
+padding: 1rem;
+margin-bottom: 2rem;
+}
+
+.tab-content {
+padding: 1rem 0;
+}
+
+.tab-pane {
+display: none;
+}
+
+.tab-pane.is-active {
+display: block;
+}
+
+.table-container {
+overflow-x: auto;
+max-width: 100%;
+}
+
+/* Highlight cells for top performers */
+.highlight-1 {
+background-color: rgba(72, 199, 116, 0.7) !important; /* 1st place - green!70 */
+font-weight: bold;
+color: white;
+}
+
+.highlight-2 {
+background-color: rgba(72, 199, 116, 0.5) !important; /* 2nd place - green!50 */
+font-weight: bold;
+}
+
+.highlight-3 {
+background-color: rgba(72, 199, 116, 0.2) !important; /* 3rd place - green!20 */
+}
+
+.legend {
+font-size: 0.8rem;
+}
static/css/results.css
ADDED
@@ -0,0 +1,84 @@
.tab-content {
display: none;
}
#main {
display: block;
}
.tab-container {
margin-bottom: 2rem;
}
.results-table {
width: 100%;
overflow-x: auto;
}
.table-container {
padding: 1.5rem;
background-color: white;
border-radius: 6px;
box-shadow: 0 2px 3px rgba(10, 10, 10, 0.1);
}
.results-title {
margin-bottom: 1.5rem;
color: #004d99;
}
.navbar {
background-color: white;
box-shadow: 0 2px 3px rgba(10, 10, 10, 0.1);
}
.flame {
font-weight: bold;
color: #ff6b00;
}
body {
padding-top: 52px;
}
.tooltip-trigger {
position: relative;
cursor: help;
}
.tooltip-trigger:hover::after {
content: attr(data-tooltip);
position: absolute;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
background-color: rgba(0, 0, 0, 0.85);
color: white;
padding: 8px 12px;
border-radius: 4px;
font-size: 0.8rem;
white-space: normal;
max-width: 300px;
z-index: 100;
text-align: left;
box-shadow: 0 2px 5px rgba(0,0,0,0.2);
}

/* Table borders */
.column-border-right {
border-right: 2px solid #dbdbdb;
}
.column-border-left {
border-left: 2px solid #ccc;
}
.row-border-bottom {
border-bottom: 2px solid #999;
}

/* Performance highlighting colors */
.performance-best {
background-color: #48c774 !important;
}
.performance-strong {
background-color: #b5f2c3 !important;
}

/* Legend styles */
.performance-legend {
margin-top: 1rem;
}
.performance-legend-item {
display: inline-block;
padding: 0 0.5rem;
margin-right: 0.5rem;
}
static/js/index.js
CHANGED
@@ -76,3 +76,30 @@ $(document).ready(function() {
bulmaSlider.attach();

})
+
+// JavaScript for Tab Switching in Results Section
+document.addEventListener('DOMContentLoaded', function() {
+  // Get all tab elements
+  const tabs = document.querySelectorAll('#results-tabs li');
+
+  // Add click event to each tab
+  tabs.forEach(tab => {
+    tab.addEventListener('click', function() {
+      // Remove active class from all tabs
+      tabs.forEach(t => t.classList.remove('is-active'));
+
+      // Add active class to clicked tab
+      this.classList.add('is-active');
+
+      // Get target tab pane
+      const targetId = this.getAttribute('data-tab');
+      const tabPanes = document.querySelectorAll('.tab-pane');
+
+      // Hide all tab panes
+      tabPanes.forEach(pane => pane.classList.remove('is-active'));
+
+      // Show target tab pane
+      document.getElementById(targetId).classList.add('is-active');
+    });
+  });
+});
static/js/results.js
ADDED
@@ -0,0 +1,34 @@
document.addEventListener('DOMContentLoaded', () => {
  // Tab switching functionality
  const tabs = document.querySelectorAll('.tabs li');
  const tabContents = document.querySelectorAll('.tab-content');

  tabs.forEach(tab => {
    tab.addEventListener('click', () => {
      // Deactivate all tabs
      tabs.forEach(t => t.classList.remove('is-active'));
      tabContents.forEach(tc => tc.style.display = 'none');

      // Activate current tab
      tab.classList.add('is-active');
      const tabId = tab.getAttribute('data-tab');
      document.getElementById(tabId).style.display = 'block';
    });
  });

  // Get all "navbar-burger" elements
  const $navbarBurgers = Array.prototype.slice.call(document.querySelectorAll('.navbar-burger'), 0);

  // Add a click event on each of them
  $navbarBurgers.forEach(el => {
    el.addEventListener('click', () => {
      // Get the target from the "data-target" attribute
      const target = el.dataset.target;
      const $target = document.getElementById(target);

      // Toggle the "is-active" class on both the "navbar-burger" and the "navbar-menu"
      el.classList.toggle('is-active');
      $target.classList.toggle('is-active');
    });
  });
});
text_classification_table.html
ADDED
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- Text Classification -->
|
2 |
+
<div id="text-classification" class="tab-content">
|
3 |
+
<h2 class="title is-4">Text Classification Task Results</h2>
|
4 |
+
<div class="results-table">
|
5 |
+
<table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
|
6 |
+
<thead>
|
7 |
+
<tr>
|
8 |
+
<th rowspan="2">Model</th>
|
9 |
+
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A fine-grained dataset designed for intent detection within the banking domain, comprising 13,083 customer service queries annotated with 77 unique intents.">Banking77</th>
|
10 |
+
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset designed to evaluate machine learning models using tabular data and profile text inputs for financial risk prediction, covering default, fraud, and churn with 333,000 labeled instances.">FinBench</th>
|
11 |
+
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="A dataset of Federal Open Market Committee speeches, meeting minutes, and press conference transcripts (1996-2022) for hawkish-dovish classification of monetary policy stance.">FOMC</th>
|
12 |
+
<th colspan="4" class="has-text-centered tooltip-trigger" data-tooltip="An expert-annotated dataset for detecting fine-grained investor claims within financial narratives, focusing on numerals in analyst reports and earnings call transcripts.">NumClaim</th>
|
13 |
+
<th colspan="1" class="has-text-centered tooltip-trigger" data-tooltip="A dataset of 11,412 human-annotated financial news headlines focused on commodities (particularly gold), spanning 2000-2019, with binary indicators for price mentions and movements.">Headlines</th>
|
14 |
+
</tr>
|
15 |
+
<tr>
|
16 |
+
<th class="has-text-centered">Accuracy</th>
|
17 |
+
<th class="has-text-centered">Precision</th>
|
18 |
+
<th class="has-text-centered">Recall</th>
|
19 |
+
<th class="has-text-centered">F1</th>
|
20 |
+
<th class="has-text-centered">Accuracy</th>
|
21 |
+
<th class="has-text-centered">Precision</th>
|
22 |
+
<th class="has-text-centered">Recall</th>
|
23 |
+
<th class="has-text-centered">F1</th>
|
24 |
+
<th class="has-text-centered">Accuracy</th>
|
25 |
+
<th class="has-text-centered">Precision</th>
|
26 |
+
<th class="has-text-centered">Recall</th>
|
27 |
+
<th class="has-text-centered">F1</th>
|
28 |
+
<th class="has-text-centered">Accuracy</th>
|
29 |
+
<th class="has-text-centered">Precision</th>
|
30 |
+
<th class="has-text-centered">Recall</th>
|
31 |
+
<th class="has-text-centered">F1</th>
|
32 |
+
<th class="has-text-centered">Accuracy</th>
|
33 |
+
</tr>
|
34 |
+
</thead>
|
35 |
+
<tbody>
|
36 |
+
<tr>
|
37 |
+
<td>Llama 3 70B Instruct</td>
|
38 |
+
<td class="has-text-centered">0.660</td>
|
39 |
+
<td class="has-text-centered">0.748</td>
|
40 |
+
<td class="has-text-centered">0.660</td>
|
41 |
+
<td class="has-text-centered">0.645</td>
|
42 |
+
<td class="has-text-centered">0.222</td>
|
43 |
+
<td class="has-text-centered">0.826</td>
|
44 |
+
<td class="has-text-centered">0.222</td>
|
45 |
+
<td class="has-text-centered">0.309</td>
|
46 |
+
<td class="has-text-centered">0.661</td>
|
47 |
+
<td class="has-text-centered">0.662</td>
|
48 |
+
<td class="has-text-centered">0.661</td>
|
49 |
+
<td class="has-text-centered">0.652</td>
|
50 |
+
<td class="has-text-centered">0.430</td>
|
51 |
+
<td class="has-text-centered">0.240</td>
|
52 |
+
<td class="has-text-centered performance-medium">0.980</td>
|
53 |
+
<td class="has-text-centered">0.386</td>
|
54 |
+
<td class="has-text-centered">0.811</td>
|
55 |
+
</tr>
|
56 |
+
<tr>
|
57 |
+
<td>Llama 3 8B Instruct</td>
|
58 |
+
<td class="has-text-centered">0.534</td>
|
59 |
+
<td class="has-text-centered">0.672</td>
|
60 |
+
<td class="has-text-centered">0.534</td>
|
61 |
+
<td class="has-text-centered">0.512</td>
|
62 |
+
<td class="has-text-centered">0.543</td>
|
63 |
+
<td class="has-text-centered">0.857</td>
|
64 |
+
<td class="has-text-centered">0.543</td>
|
65 |
+
<td class="has-text-centered">0.659</td>
|
66 |
+
<td class="has-text-centered">0.565</td>
|
67 |
+
<td class="has-text-centered">0.618</td>
|
68 |
+
<td class="has-text-centered">0.565</td>
|
69 |
+
<td class="has-text-centered">0.497</td>
|
70 |
+
<td class="has-text-centered">0.801</td>
|
71 |
+
<td class="has-text-centered">0.463</td>
|
72 |
+
<td class="has-text-centered">0.571</td>
|
73 |
+
<td class="has-text-centered">0.511</td>
|
74 |
+
<td class="has-text-centered">0.763</td>
|
75 |
+
</tr>
|
76 |
+
<tr>
|
77 |
+
<td>DBRX Instruct</td>
|
78 |
+
<td class="has-text-centered">0.578</td>
|
79 |
+
<td class="has-text-centered">0.706</td>
|
80 |
+
<td class="has-text-centered">0.578</td>
|
81 |
+
<td class="has-text-centered">0.574</td>
|
82 |
+
<td class="has-text-centered">0.359</td>
|
83 |
+
<td class="has-text-centered">0.851</td>
|
84 |
+
<td class="has-text-centered">0.359</td>
|
85 |
+
<td class="has-text-centered">0.483</td>
|
86 |
+
<td class="has-text-centered">0.285</td>
|
87 |
+
<td class="has-text-centered">0.572</td>
|
88 |
+
<td class="has-text-centered">0.285</td>
|
89 |
+
<td class="has-text-centered">0.193</td>
|
90 |
+
<td class="has-text-centered">0.222</td>
|
91 |
+
<td class="has-text-centered">0.190</td>
|
92 |
+
<td class="has-text-centered performance-best">1.000</td>
|
93 |
+
<td class="has-text-centered">0.319</td>
|
94 |
+
<td class="has-text-centered">0.746</td>
|
95 |
+
</tr>
|
96 |
+
<tr>
|
97 |
+
<td>DeepSeek LLM (67B)</td>
|
98 |
+
<td class="has-text-centered">0.596</td>
|
99 |
+
<td class="has-text-centered">0.711</td>
|
100 |
+
<td class="has-text-centered">0.596</td>
|
101 |
+
<td class="has-text-centered">0.578</td>
|
102 |
+
<td class="has-text-centered">0.369</td>
|
103 |
+
<td class="has-text-centered">0.856</td>
|
104 |
+
<td class="has-text-centered">0.369</td>
|
105 |
+
<td class="has-text-centered">0.492</td>
|
106 |
+
<td class="has-text-centered">0.532</td>
|
107 |
+
<td class="has-text-centered">0.678</td>
|
108 |
+
<td class="has-text-centered">0.532</td>
|
109 |
+
<td class="has-text-centered">0.407</td>
|
110 |
+
<td class="has-text-centered">0.832</td>
|
111 |
+
<td class="has-text-centered performance-best">1.000</td>
|
112 |
+
<td class="has-text-centered">0.082</td>
|
113 |
+
<td class="has-text-centered">0.151</td>
|
114 |
+
<td class="has-text-centered">0.778</td>
|
115 |
+
</tr>
|
116 |
+
<tr>
|
117 |
+
<td>Gemma 2 27B</td>
|
118 |
+
<td class="has-text-centered">0.639</td>
|
119 |
+
<td class="has-text-centered">0.730</td>
|
120 |
+
<td class="has-text-centered">0.639</td>
|
121 |
+
<td class="has-text-centered">0.621</td>
|
122 |
+
<td class="has-text-centered">0.410</td>
|
123 |
+
<td class="has-text-centered">0.849</td>
|
124 |
+
<td class="has-text-centered">0.410</td>
|
125 |
+
<td class="has-text-centered">0.538</td>
|
126 |
+
<td class="has-text-centered">0.651</td>
|
127 |
+
<td class="has-text-centered">0.704</td>
|
128 |
+
<td class="has-text-centered">0.651</td>
|
129 |
+
<td class="has-text-centered">0.620</td>
|
130 |
+
<td class="has-text-centered">0.471</td>
|
131 |
+
<td class="has-text-centered">0.257</td>
|
132 |
+
<td class="has-text-centered performance-best">1.000</td>
|
133 |
+
<td class="has-text-centered">0.408</td>
|
134 |
+
<td class="has-text-centered">0.808</td>
|
135 |
+
</tr>
|
136 |
+
<tr>
|
137 |
+
<td>Gemma 2 9B</td>
|
138 |
+
<td class="has-text-centered">0.630</td>
|
139 |
+
<td class="has-text-centered">0.710</td>
|
140 |
+
<td class="has-text-centered">0.630</td>
|
141 |
+
<td class="has-text-centered">0.609</td>
|
142 |
+
<td class="has-text-centered">0.412</td>
|
143 |
+
<td class="has-text-centered">0.848</td>
|
144 |
+
<td class="has-text-centered">0.412</td>
|
145 |
+
<td class="has-text-centered">0.541</td>
|
146 |
+
<td class="has-text-centered">0.595</td>
|
147 |
+
<td class="has-text-centered">0.694</td>
|
148 |
+
<td class="has-text-centered">0.595</td>
|
149 |
+
<td class="has-text-centered">0.519</td>
|
150 |
+
<td class="has-text-centered">0.371</td>
|
151 |
+
<td class="has-text-centered">0.224</td>
|
152 |
+
<td class="has-text-centered performance-strong">0.990</td>
|
153 |
+
<td class="has-text-centered">0.365</td>
|
154 |
+
<td class="has-text-centered performance-best">0.856</td>
|
155 |
+
</tr>
|
156 |
+
<tr>
|
157 |
+
<td>Mistral (7B) Instruct v0.3</td>
|
158 |
+
<td class="has-text-centered">0.547</td>
|
159 |
+
<td class="has-text-centered">0.677</td>
|
160 |
+
<td class="has-text-centered">0.547</td>
|
161 |
+
<td class="has-text-centered">0.528</td>
|
162 |
+
<td class="has-text-centered">0.375</td>
|
163 |
+
<td class="has-text-centered">0.839</td>
|
164 |
+
<td class="has-text-centered">0.375</td>
|
165 |
+
<td class="has-text-centered">0.503</td>
|
166 |
+
<td class="has-text-centered">0.587</td>
|
167 |
+
<td class="has-text-centered">0.598</td>
|
168 |
+
<td class="has-text-centered">0.587</td>
|
169 |
+
<td class="has-text-centered">0.542</td>
|
170 |
+
<td class="has-text-centered">0.521</td>
|
171 |
+
<td class="has-text-centered">0.266</td>
|
172 |
+
<td class="has-text-centered">0.918</td>
|
173 |
+
<td class="has-text-centered">0.412</td>
|
174 |
+
<td class="has-text-centered">0.779</td>
|
175 |
+
</tr>
|
176 |
+
<tr>
|
177 |
+
<td>Mixtral-8x22B Instruct</td>
|
178 |
+
<td class="has-text-centered">0.622</td>
|
179 |
+
<td class="has-text-centered">0.718</td>
|
180 |
+
<td class="has-text-centered">0.622</td>
|
181 |
+
<td class="has-text-centered">0.602</td>
|
182 |
+
<td class="has-text-centered">0.166</td>
|
183 |
+
<td class="has-text-centered">0.811</td>
|
184 |
+
<td class="has-text-centered">0.166</td>
|
185 |
+
<td class="has-text-centered">0.221</td>
|
186 |
+
<td class="has-text-centered">0.562</td>
|
187 |
+
<td class="has-text-centered">0.709</td>
|
188 |
+
<td class="has-text-centered">0.562</td>
|
189 |
+
<td class="has-text-centered">0.465</td>
|
190 |
+
<td class="has-text-centered">0.732</td>
|
191 |
+
<td class="has-text-centered">0.384</td>
|
192 |
+
<td class="has-text-centered">0.775</td>
|
193 |
+
<td class="has-text-centered">0.513</td>
|
194 |
+
<td class="has-text-centered performance-medium">0.835</td>
|
195 |
+
</tr>
|
196 |
+
<tr>
|
197 |
+
<td>Mixtral-8x7B Instruct</td>
|
198 |
+
<td class="has-text-centered">0.567</td>
|
199 |
+
<td class="has-text-centered">0.693</td>
|
200 |
+
<td class="has-text-centered">0.567</td>
|
201 |
+
<td class="has-text-centered">0.547</td>
|
202 |
+
<td class="has-text-centered">0.285</td>
|
203 |
+
<td class="has-text-centered">0.838</td>
|
204 |
+
<td class="has-text-centered">0.285</td>
|
205 |
+
<td class="has-text-centered">0.396</td>
|
206 |
+
<td class="has-text-centered">0.623</td>
|
207 |
+
<td class="has-text-centered">0.636</td>
|
208 |
+
<td class="has-text-centered">0.623</td>
|
209 |
+
<td class="has-text-centered">0.603</td>
|
210 |
+
<td class="has-text-centered">0.765</td>
|
211 |
+
<td class="has-text-centered">0.431</td>
|
212 |
+
<td class="has-text-centered">0.898</td>
|
213 |
+
<td class="has-text-centered">0.583</td>
|
214 |
+
<td class="has-text-centered">0.805</td>
|
215 |
+
</tr>
|
216 |
+
<tr>
|
217 |
+
<td>Qwen 2 Instruct (72B)</td>
|
218 |
+
<td class="has-text-centered">0.644</td>
|
219 |
+
<td class="has-text-centered">0.730</td>
|
220 |
+
<td class="has-text-centered">0.644</td>
|
221 |
+
<td class="has-text-centered">0.627</td>
|
222 |
+
<td class="has-text-centered">0.370</td>
|
223 |
+
<td class="has-text-centered">0.848</td>
|
224 |
+
<td class="has-text-centered">0.370</td>
|
225 |
+
<td class="has-text-centered">0.495</td>
|
226 |
+
<td class="has-text-centered">0.623</td>
|
227 |
+
<td class="has-text-centered">0.639</td>
|
228 |
+
<td class="has-text-centered">0.623</td>
|
229 |
+
<td class="has-text-centered">0.605</td>
|
230 |
+
<td class="has-text-centered">0.821</td>
|
231 |
+
<td class="has-text-centered">0.506</td>
|
232 |
+
<td class="has-text-centered">0.867</td>
|
233 |
+
<td class="has-text-centered">0.639</td>
|
234 |
+
<td class="has-text-centered">0.830</td>
|
235 |
+
</tr>
|
236 |
+
<tr>
|
237 |
+
<td>WizardLM-2 8x22B</td>
|
238 |
+
<td class="has-text-centered">0.664</td>
|
239 |
+
<td class="has-text-centered">0.737</td>
|
240 |
+
<td class="has-text-centered">0.664</td>
|
241 |
+
<td class="has-text-centered">0.648</td>
|
242 |
+
<td class="has-text-centered">0.373</td>
|
243 |
+
<td class="has-text-centered">0.842</td>
|
244 |
+
<td class="has-text-centered">0.373</td>
|
245 |
+
<td class="has-text-centered">0.500</td>
|
246 |
+
<td class="has-text-centered">0.583</td>
|
247 |
+
<td class="has-text-centered performance-medium">0.710</td>
|
248 |
+
<td class="has-text-centered">0.583</td>
|
249 |
+
<td class="has-text-centered">0.505</td>
|
250 |
+
<td class="has-text-centered">0.831</td>
|
251 |
+
<td class="has-text-centered">0.630</td>
|
252 |
+
<td class="has-text-centered">0.173</td>
|
253 |
+
<td class="has-text-centered">0.272</td>
|
254 |
+
<td class="has-text-centered">0.797</td>
|
255 |
+
</tr>
|
256 |
+
<tr>
|
257 |
+
<td>DeepSeek-V3</td>
|
258 |
+
<td class="has-text-centered performance-strong">0.722</td>
|
259 |
+
<td class="has-text-centered performance-medium">0.774</td>
|
260 |
+
<td class="has-text-centered performance-strong">0.722</td>
|
261 |
+
<td class="has-text-centered performance-strong">0.714</td>
|
262 |
+
<td class="has-text-centered">0.362</td>
|
263 |
+
<td class="has-text-centered">0.845</td>
|
264 |
+
<td class="has-text-centered">0.362</td>
|
265 |
+
<td class="has-text-centered">0.487</td>
|
266 |
+
<td class="has-text-centered">0.625</td>
<td class="has-text-centered performance-strong">0.712</td>
<td class="has-text-centered">0.625</td>
<td class="has-text-centered">0.578</td>
<td class="has-text-centered">0.860</td>
<td class="has-text-centered">0.586</td>
<td class="has-text-centered">0.796</td>
<td class="has-text-centered">0.675</td>
<td class="has-text-centered">0.729</td>
</tr>
<tr>
<td>DeepSeek R1</td>
<td class="has-text-centered performance-best">0.772</td>
<td class="has-text-centered performance-strong">0.789</td>
<td class="has-text-centered performance-best">0.772</td>
<td class="has-text-centered performance-best">0.763</td>
<td class="has-text-centered">0.306</td>
<td class="has-text-centered">0.846</td>
<td class="has-text-centered">0.306</td>
<td class="has-text-centered">0.419</td>
<td class="has-text-centered performance-strong">0.679</td>
<td class="has-text-centered">0.682</td>
<td class="has-text-centered performance-strong">0.679</td>
<td class="has-text-centered performance-strong">0.670</td>
<td class="has-text-centered">0.851</td>
<td class="has-text-centered">0.557</td>
<td class="has-text-centered">0.898</td>
<td class="has-text-centered">0.688</td>
<td class="has-text-centered">0.769</td>
</tr>
<tr>
<td>QwQ-32B-Preview</td>
<td class="has-text-centered">0.577</td>
<td class="has-text-centered">0.747</td>
<td class="has-text-centered">0.577</td>
<td class="has-text-centered">0.613</td>
<td class="has-text-centered performance-strong">0.716</td>
<td class="has-text-centered performance-strong">0.871</td>
<td class="has-text-centered performance-strong">0.716</td>
<td class="has-text-centered performance-strong">0.784</td>
<td class="has-text-centered">0.591</td>
<td class="has-text-centered">0.630</td>
<td class="has-text-centered">0.591</td>
<td class="has-text-centered">0.555</td>
<td class="has-text-centered">0.819</td>
<td class="has-text-centered performance-best">1.000</td>
<td class="has-text-centered">0.010</td>
<td class="has-text-centered">0.020</td>
<td class="has-text-centered">0.744</td>
</tr>
<tr>
<td>Jamba 1.5 Mini</td>
<td class="has-text-centered">0.528</td>
<td class="has-text-centered">0.630</td>
<td class="has-text-centered">0.528</td>
<td class="has-text-centered">0.508</td>
<td class="has-text-centered performance-best">0.913</td>
<td class="has-text-centered performance-best">0.883</td>
<td class="has-text-centered performance-best">0.913</td>
<td class="has-text-centered performance-best">0.898</td>
<td class="has-text-centered">0.572</td>
<td class="has-text-centered">0.678</td>
<td class="has-text-centered">0.572</td>
<td class="has-text-centered">0.499</td>
<td class="has-text-centered">0.812</td>
<td class="has-text-centered">0.429</td>
<td class="has-text-centered">0.092</td>
<td class="has-text-centered">0.151</td>
<td class="has-text-centered">0.682</td>
</tr>
<tr>
<td>Jamba 1.5 Large</td>
<td class="has-text-centered">0.642</td>
<td class="has-text-centered">0.746</td>
<td class="has-text-centered">0.642</td>
<td class="has-text-centered">0.628</td>
<td class="has-text-centered">0.494</td>
<td class="has-text-centered">0.851</td>
<td class="has-text-centered">0.494</td>
<td class="has-text-centered">0.618</td>
<td class="has-text-centered">0.597</td>
<td class="has-text-centered">0.650</td>
<td class="has-text-centered">0.597</td>
<td class="has-text-centered">0.550</td>
<td class="has-text-centered">0.855</td>
<td class="has-text-centered">0.639</td>
<td class="has-text-centered">0.469</td>
<td class="has-text-centered">0.541</td>
<td class="has-text-centered">0.782</td>
</tr>
<tr>
<td>Claude 3.5 Sonnet</td>
<td class="has-text-centered">0.682</td>
<td class="has-text-centered">0.755</td>
<td class="has-text-centered">0.682</td>
<td class="has-text-centered">0.668</td>
<td class="has-text-centered">0.513</td>
<td class="has-text-centered">0.854</td>
<td class="has-text-centered">0.513</td>
<td class="has-text-centered">0.634</td>
<td class="has-text-centered performance-medium">0.675</td>
<td class="has-text-centered">0.677</td>
<td class="has-text-centered performance-medium">0.675</td>
<td class="has-text-centered performance-best">0.674</td>
<td class="has-text-centered performance-medium">0.879</td>
<td class="has-text-centered">0.646</td>
<td class="has-text-centered">0.745</td>
<td class="has-text-centered performance-medium">0.692</td>
<td class="has-text-centered">0.827</td>
</tr>
<tr>
<td>Claude 3 Haiku</td>
<td class="has-text-centered">0.639</td>
<td class="has-text-centered">0.735</td>
<td class="has-text-centered">0.639</td>
<td class="has-text-centered">0.622</td>
<td class="has-text-centered">0.067</td>
<td class="has-text-centered">0.674</td>
<td class="has-text-centered">0.067</td>
<td class="has-text-centered">0.022</td>
<td class="has-text-centered">0.633</td>
<td class="has-text-centered">0.634</td>
<td class="has-text-centered">0.633</td>
<td class="has-text-centered">0.631</td>
<td class="has-text-centered">0.838</td>
<td class="has-text-centered">0.556</td>
<td class="has-text-centered">0.561</td>
<td class="has-text-centered">0.558</td>
<td class="has-text-centered">0.781</td>
</tr>
<tr>
<td>Cohere Command R 7B</td>
<td class="has-text-centered">0.530</td>
<td class="has-text-centered">0.650</td>
<td class="has-text-centered">0.530</td>
<td class="has-text-centered">0.516</td>
<td class="has-text-centered performance-medium">0.682</td>
<td class="has-text-centered performance-medium">0.868</td>
<td class="has-text-centered performance-medium">0.682</td>
<td class="has-text-centered performance-medium">0.762</td>
<td class="has-text-centered">0.536</td>
<td class="has-text-centered">0.505</td>
<td class="has-text-centered">0.536</td>
<td class="has-text-centered">0.459</td>
<td class="has-text-centered">0.797</td>
<td class="has-text-centered">0.210</td>
<td class="has-text-centered">0.041</td>
<td class="has-text-centered">0.068</td>
<td class="has-text-centered">0.770</td>
</tr>
<tr>
<td>Cohere Command R +</td>
<td class="has-text-centered">0.660</td>
<td class="has-text-centered">0.747</td>
<td class="has-text-centered">0.660</td>
<td class="has-text-centered">0.651</td>
<td class="has-text-centered">0.575</td>
<td class="has-text-centered">0.859</td>
<td class="has-text-centered">0.575</td>
<td class="has-text-centered">0.684</td>
<td class="has-text-centered">0.526</td>
<td class="has-text-centered">0.655</td>
<td class="has-text-centered">0.526</td>
<td class="has-text-centered">0.393</td>
<td class="has-text-centered">0.804</td>
<td class="has-text-centered">0.333</td>
<td class="has-text-centered">0.071</td>
<td class="has-text-centered">0.118</td>
<td class="has-text-centered">0.812</td>
</tr>
<tr>
<td>Google Gemini 1.5 Pro</td>
<td class="has-text-centered">0.483</td>
<td class="has-text-centered">0.487</td>
<td class="has-text-centered">0.483</td>
<td class="has-text-centered">0.418</td>
<td class="has-text-centered">0.240</td>
<td class="has-text-centered">0.823</td>
<td class="has-text-centered">0.240</td>
<td class="has-text-centered">0.336</td>
<td class="has-text-centered">0.619</td>
<td class="has-text-centered">0.667</td>
<td class="has-text-centered">0.619</td>
<td class="has-text-centered">0.579</td>
<td class="has-text-centered">0.700</td>
<td class="has-text-centered">0.369</td>
<td class="has-text-centered">0.908</td>
<td class="has-text-centered">0.525</td>
<td class="has-text-centered performance-strong">0.837</td>
</tr>
<tr>
<td>OpenAI gpt-4o</td>
<td class="has-text-centered performance-medium">0.704</td>
<td class="has-text-centered performance-best">0.792</td>
<td class="has-text-centered performance-medium">0.704</td>
<td class="has-text-centered performance-medium">0.710</td>
<td class="has-text-centered">0.396</td>
<td class="has-text-centered">0.846</td>
<td class="has-text-centered">0.396</td>
<td class="has-text-centered">0.524</td>
<td class="has-text-centered performance-best">0.681</td>
<td class="has-text-centered performance-best">0.719</td>
<td class="has-text-centered performance-best">0.681</td>
<td class="has-text-centered performance-medium">0.664</td>
<td class="has-text-centered performance-best">0.896</td>
<td class="has-text-centered performance-medium">0.667</td>
<td class="has-text-centered">0.857</td>
<td class="has-text-centered performance-best">0.750</td>
<td class="has-text-centered">0.824</td>
</tr>
<tr>
<td>OpenAI o1-mini</td>
<td class="has-text-centered">0.681</td>
<td class="has-text-centered">0.760</td>
<td class="has-text-centered">0.681</td>
<td class="has-text-centered">0.670</td>
<td class="has-text-centered">0.487</td>
<td class="has-text-centered">0.851</td>
<td class="has-text-centered">0.487</td>
<td class="has-text-centered">0.612</td>
<td class="has-text-centered">0.651</td>
<td class="has-text-centered">0.670</td>
<td class="has-text-centered">0.651</td>
<td class="has-text-centered">0.635</td>
<td class="has-text-centered performance-strong">0.888</td>
<td class="has-text-centered performance-medium">0.664</td>
<td class="has-text-centered">0.786</td>
<td class="has-text-centered performance-strong">0.720</td>
<td class="has-text-centered">0.769</td>
</tr>
</tbody>
</table>
<div class="content is-small mt-4">
<p><strong>Note:</strong> Color highlighting indicates performance ranking:
<span class="performance-best"> Best </span>,
<span class="performance-strong"> Strong </span>,
<span class="performance-medium"> Good </span>
</p>
</div>
</div>
</div>
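For reference, the Causal Classification (CC) columns above (precision, recall, F1, accuracy) can be reproduced from the parsed 0/1 model outputs. The following is a minimal sketch assuming scikit-learn and weighted averaging; the averaging scheme actually used for these tables is not recorded in this file, and per-class (macro) or binary averaging would yield different precision/recall values.

# Minimal sketch: computing CC-style precision/recall/F1/accuracy
# from binary causal labels (1 = causal, 0 = not causal).
# Assumption: weighted averaging; FLaME's exact setting is not
# specified in this file.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = [1, 0, 1, 1, 0]   # gold labels
y_pred = [1, 0, 0, 1, 1]   # labels parsed from LLM output

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)
accuracy = accuracy_score(y_true, y_pred)
print(f"P={precision:.3f} R={recall:.3f} F1={f1:.3f} Acc={accuracy:.3f}")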
text_summarization_table.html
ADDED
@@ -0,0 +1,239 @@
<!-- Text Summarization -->
<div id="text-summarization" class="tab-content">
<h2 class="title is-4">Text Summarization Task Results</h2>
<div class="results-table">
<table class="table is-bordered is-striped is-narrow is-hoverable is-fullwidth">
<thead>
<tr>
<th rowspan="2">Model</th>
<th colspan="3" class="has-text-centered tooltip-trigger" data-tooltip="Designed for bullet-point summarization of long earnings call transcripts (ECTs) in the financial domain. 2,425 document-summary pairs from publicly traded companies' earnings calls (2019-2022), with concise bullet points extracted from Reuters articles focusing on key financial metrics.">ECTSum</th>
<th colspan="3" class="has-text-centered tooltip-trigger" data-tooltip="Financial news summarization dataset with 2,000 financial news articles, each paired with its headline as the ground-truth summary. Manually selected and cleaned to ensure high-quality annotations, providing a benchmark for evaluating LLMs on financial text summarization.">EDTSum</th>
</tr>
<tr>
<th class="has-text-centered">BERTScore Precision</th>
<th class="has-text-centered">BERTScore Recall</th>
<th class="has-text-centered">BERTScore F1</th>
<th class="has-text-centered">BERTScore Precision</th>
<th class="has-text-centered">BERTScore Recall</th>
<th class="has-text-centered">BERTScore F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>Llama 3 70B Instruct</td>
<td class="has-text-centered">0.715</td>
<td class="has-text-centered">0.801</td>
<td class="has-text-centered">0.754</td>
<td class="has-text-centered">0.793</td>
<td class="has-text-centered performance-medium">0.844</td>
<td class="has-text-centered performance-strong">0.817</td>
</tr>
<tr>
<td>Llama 3 8B Instruct</td>
<td class="has-text-centered">0.724</td>
<td class="has-text-centered">0.796</td>
<td class="has-text-centered">0.757</td>
<td class="has-text-centered">0.785</td>
<td class="has-text-centered">0.841</td>
<td class="has-text-centered">0.811</td>
</tr>
<tr>
<td>DBRX Instruct</td>
<td class="has-text-centered">0.680</td>
<td class="has-text-centered">0.786</td>
<td class="has-text-centered">0.729</td>
<td class="has-text-centered">0.774</td>
<td class="has-text-centered">0.843</td>
<td class="has-text-centered">0.806</td>
</tr>
<tr>
<td>DeepSeek LLM (67B)</td>
<td class="has-text-centered">0.692</td>
<td class="has-text-centered">0.678</td>
<td class="has-text-centered">0.681</td>
<td class="has-text-centered">0.779</td>
<td class="has-text-centered">0.840</td>
<td class="has-text-centered">0.807</td>
</tr>
<tr>
<td>Gemma 2 27B</td>
<td class="has-text-centered">0.680</td>
<td class="has-text-centered">0.777</td>
<td class="has-text-centered">0.723</td>
<td class="has-text-centered performance-strong">0.801</td>
<td class="has-text-centered">0.829</td>
<td class="has-text-centered">0.814</td>
</tr>
<tr>
<td>Gemma 2 9B</td>
<td class="has-text-centered">0.651</td>
<td class="has-text-centered">0.531</td>
<td class="has-text-centered">0.585</td>
<td class="has-text-centered performance-best">0.803</td>
<td class="has-text-centered">0.833</td>
<td class="has-text-centered performance-strong">0.817</td>
</tr>
<tr>
<td>Mistral (7B) Instruct v0.3</td>
<td class="has-text-centered">0.702</td>
<td class="has-text-centered performance-strong">0.806</td>
<td class="has-text-centered">0.750</td>
<td class="has-text-centered">0.783</td>
<td class="has-text-centered">0.842</td>
<td class="has-text-centered">0.811</td>
</tr>
<tr>
<td>Mixtral-8x22B Instruct</td>
<td class="has-text-centered">0.713</td>
<td class="has-text-centered performance-best">0.812</td>
<td class="has-text-centered">0.758</td>
<td class="has-text-centered">0.790</td>
<td class="has-text-centered">0.843</td>
<td class="has-text-centered">0.815</td>
</tr>
<tr>
<td>Mixtral-8x7B Instruct</td>
<td class="has-text-centered">0.727</td>
<td class="has-text-centered">0.773</td>
<td class="has-text-centered">0.747</td>
<td class="has-text-centered">0.785</td>
<td class="has-text-centered">0.839</td>
<td class="has-text-centered">0.810</td>
</tr>
<tr>
<td>Qwen 2 Instruct (72B)</td>
<td class="has-text-centered">0.709</td>
<td class="has-text-centered performance-medium">0.804</td>
<td class="has-text-centered">0.752</td>
<td class="has-text-centered">0.781</td>
<td class="has-text-centered performance-strong">0.846</td>
<td class="has-text-centered">0.811</td>
</tr>
<tr>
<td>WizardLM-2 8x22B</td>
<td class="has-text-centered">0.677</td>
<td class="has-text-centered performance-strong">0.806</td>
<td class="has-text-centered">0.735</td>
<td class="has-text-centered">0.774</td>
<td class="has-text-centered performance-best">0.847</td>
<td class="has-text-centered">0.808</td>
</tr>
<tr>
<td>DeepSeek-V3</td>
<td class="has-text-centered">0.703</td>
<td class="has-text-centered performance-strong">0.806</td>
<td class="has-text-centered">0.750</td>
<td class="has-text-centered">0.791</td>
<td class="has-text-centered">0.842</td>
<td class="has-text-centered">0.815</td>
</tr>
<tr>
<td>DeepSeek R1</td>
<td class="has-text-centered">0.724</td>
<td class="has-text-centered">0.800</td>
<td class="has-text-centered">0.759</td>
<td class="has-text-centered">0.770</td>
<td class="has-text-centered">0.843</td>
<td class="has-text-centered">0.804</td>
</tr>
<tr>
<td>QwQ-32B-Preview</td>
<td class="has-text-centered">0.653</td>
<td class="has-text-centered">0.751</td>
<td class="has-text-centered">0.696</td>
<td class="has-text-centered">0.797</td>
<td class="has-text-centered">0.841</td>
<td class="has-text-centered performance-strong">0.817</td>
</tr>
<tr>
<td>Jamba 1.5 Mini</td>
<td class="has-text-centered">0.692</td>
<td class="has-text-centered">0.798</td>
<td class="has-text-centered">0.741</td>
<td class="has-text-centered">0.798</td>
<td class="has-text-centered">0.838</td>
<td class="has-text-centered performance-medium">0.816</td>
</tr>
<tr>
<td>Jamba 1.5 Large</td>
<td class="has-text-centered">0.679</td>
<td class="has-text-centered">0.800</td>
<td class="has-text-centered">0.734</td>
<td class="has-text-centered">0.799</td>
<td class="has-text-centered">0.841</td>
<td class="has-text-centered performance-best">0.818</td>
</tr>
<tr>
<td>Claude 3.5 Sonnet</td>
<td class="has-text-centered performance-medium">0.737</td>
<td class="has-text-centered">0.802</td>
<td class="has-text-centered performance-medium">0.767</td>
<td class="has-text-centered">0.786</td>
<td class="has-text-centered">0.843</td>
<td class="has-text-centered">0.813</td>
</tr>
<tr>
<td>Claude 3 Haiku</td>
<td class="has-text-centered">0.683</td>
<td class="has-text-centered">0.617</td>
<td class="has-text-centered">0.646</td>
<td class="has-text-centered">0.778</td>
<td class="has-text-centered performance-medium">0.844</td>
<td class="has-text-centered">0.808</td>
</tr>
<tr>
<td>Cohere Command R 7B</td>
<td class="has-text-centered">0.724</td>
<td class="has-text-centered">0.781</td>
<td class="has-text-centered">0.750</td>
<td class="has-text-centered">0.790</td>
<td class="has-text-centered performance-medium">0.844</td>
<td class="has-text-centered">0.815</td>
</tr>
<tr>
<td>Cohere Command R +</td>
<td class="has-text-centered">0.724</td>
<td class="has-text-centered">0.782</td>
<td class="has-text-centered">0.751</td>
<td class="has-text-centered">0.789</td>
<td class="has-text-centered">0.834</td>
<td class="has-text-centered">0.810</td>
</tr>
<tr>
<td>Google Gemini 1.5 Pro</td>
<td class="has-text-centered performance-best">0.757</td>
<td class="has-text-centered">0.800</td>
<td class="has-text-centered performance-best">0.777</td>
<td class="has-text-centered performance-medium">0.800</td>
<td class="has-text-centered">0.836</td>
<td class="has-text-centered performance-strong">0.817</td>
</tr>
<tr>
<td>OpenAI gpt-4o</td>
<td class="has-text-centered performance-strong">0.755</td>
<td class="has-text-centered">0.793</td>
<td class="has-text-centered performance-strong">0.773</td>
<td class="has-text-centered">0.795</td>
<td class="has-text-centered">0.840</td>
<td class="has-text-centered performance-medium">0.816</td>
</tr>
<tr>
<td>OpenAI o1-mini</td>
<td class="has-text-centered">0.731</td>
<td class="has-text-centered">0.801</td>
<td class="has-text-centered">0.763</td>
<td class="has-text-centered">0.795</td>
<td class="has-text-centered">0.840</td>
<td class="has-text-centered performance-medium">0.816</td>
</tr>
</tbody>
</table>
<div class="content is-small mt-4">
<p><strong>Note:</strong> Color highlighting indicates performance ranking:
<span class="performance-best"> Best </span>,
<span class="performance-strong"> Strong </span>,
<span class="performance-medium"> Good </span>
</p>
</div>
</div>
</div>
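For reference, the BERTScore precision/recall/F1 columns in the ECTSum and EDTSum tables can be computed with the `bert-score` package. The following is a minimal sketch assuming its default English model and no baseline rescaling; the exact checkpoint and settings behind these results are not recorded in this file. Intuitively, BERTScore recall rewards covering the reference summary's content, while precision penalizes extra content, which is why the two columns diverge for verbose models.

# Minimal sketch: BERTScore for generated vs. reference summaries.
# Assumptions: default English model, no baseline rescaling; FLaME's
# exact configuration is not specified in this file.
from bert_score import score

candidates = ["Q3 revenue rose 12% year over year."]     # model summaries
references = ["Company reports 12% Q3 revenue growth."]  # gold summaries

P, R, F1 = score(candidates, references, lang="en", verbose=False)
print(f"P={P.mean().item():.3f} R={R.mean().item():.3f} F1={F1.mean().item():.3f}")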