Spaces:
Running
Running
last table updates
Browse files- results.html +361 -263
results.html
CHANGED
@@ -648,11 +648,11 @@
|
|
648 |
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FinEntity</th>
|
649 |
</tr>
|
650 |
<tr>
|
651 |
-
<th class="has-text-centered">Accuracy</th>
|
652 |
<th class="has-text-centered">Precision</th>
|
653 |
<th class="has-text-centered">Recall</th>
|
654 |
<th class="has-text-centered">F1</th>
|
655 |
<th class="has-text-centered">Accuracy</th>
|
|
|
656 |
<th class="has-text-centered">Precision</th>
|
657 |
<th class="has-text-centered">Recall</th>
|
658 |
<th class="has-text-centered">F1</th>
|
@@ -660,13 +660,13 @@
|
|
660 |
<th class="has-text-centered">Precision</th>
|
661 |
<th class="has-text-centered">Recall</th>
|
662 |
<th class="has-text-centered">F1</th>
|
663 |
-
<th class="has-text-centered">Accuracy</th>
|
664 |
<th class="has-text-centered">Precision</th>
|
665 |
<th class="has-text-centered">Recall</th>
|
666 |
<th class="has-text-centered">F1</th>
|
667 |
<th class="has-text-centered">Accuracy</th>
|
668 |
<th class="has-text-centered">Precision</th>
|
669 |
<th class="has-text-centered">Recall</th>
|
|
|
670 |
<th class="has-text-centered">F1</th>
|
671 |
</tr>
|
672 |
</thead>
|
@@ -1365,21 +1365,26 @@
|
|
1365 |
<tr>
|
1366 |
<th rowspan="2">Model</th>
|
1367 |
<th colspan="3" class="has-text-centered tooltip-trigger tooltip-right" data-title="FiQA Task 1" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis in microblog posts and news headlines using a continuous scale from -1 (negative) to 1 (positive). The regression task requires models to accurately predict the sentiment score that reflects investor perception of financial texts.">FiQA Task 1</th>
|
1368 |
-
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-
|
1369 |
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="SubjECTive-QA" data-tooltip="SubjECTive-QA contains 49,446 annotations across 2,747 question-answer pairs extracted from 120 earnings call transcripts. The multi-label classification task involves analyzing six subjective features in financial discourse: assertiveness, cautiousness, optimism, specificity, clarity, and relevance.">SubjECTive-QA</th>
|
|
|
1370 |
</tr>
|
1371 |
<tr>
|
1372 |
<th class="has-text-centered">MSE</th>
|
1373 |
<th class="has-text-centered">MAE</th>
|
1374 |
<th class="has-text-centered">r² Score</th>
|
1375 |
-
<th class="has-text-centered">Accuracy</th>
|
1376 |
<th class="has-text-centered">Precision</th>
|
1377 |
<th class="has-text-centered">Recall</th>
|
|
|
1378 |
<th class="has-text-centered">F1</th>
|
1379 |
<th class="has-text-centered">Precision</th>
|
1380 |
<th class="has-text-centered">Recall</th>
|
1381 |
<th class="has-text-centered">F1</th>
|
1382 |
<th class="has-text-centered">Accuracy</th>
|
|
|
|
|
|
|
|
|
1383 |
</tr>
|
1384 |
</thead>
|
1385 |
<tbody>
|
@@ -1388,323 +1393,416 @@
|
|
1388 |
<td class="has-text-centered">0.123</td>
|
1389 |
<td class="has-text-centered">0.290</td>
|
1390 |
<td class="has-text-centered">0.272</td>
|
1391 |
-
<td class="has-text-centered">0.
|
1392 |
-
<td class="has-text-centered">0.
|
1393 |
-
<td class="has-text-centered">0.
|
1394 |
-
<td class="has-text-centered">0.
|
1395 |
<td class="has-text-centered">0.652</td>
|
1396 |
<td class="has-text-centered">0.573</td>
|
1397 |
<td class="has-text-centered">0.535</td>
|
1398 |
<td class="has-text-centered">0.573</td>
|
1399 |
-
|
1400 |
-
|
|
|
|
|
|
|
|
|
1401 |
<td>Llama 3 8B Instruct</td>
|
1402 |
<td class="has-text-centered">0.161</td>
|
1403 |
<td class="has-text-centered">0.344</td>
|
1404 |
<td class="has-text-centered">0.045</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1405 |
<td class="has-text-centered">0.738</td>
|
1406 |
<td class="has-text-centered">0.801</td>
|
1407 |
<td class="has-text-centered">0.738</td>
|
1408 |
<td class="has-text-centered">0.698</td>
|
1409 |
-
|
1410 |
-
|
1411 |
-
<td class="has-text-centered performance-best">0.600</td>
|
1412 |
-
<td class="has-text-centered">0.625</td>
|
1413 |
-
</tr>
|
1414 |
-
<tr>
|
1415 |
<td>DBRX Instruct</td>
|
1416 |
<td class="has-text-centered">0.160</td>
|
1417 |
<td class="has-text-centered">0.321</td>
|
1418 |
<td class="has-text-centered">0.052</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1419 |
<td class="has-text-centered">0.524</td>
|
1420 |
<td class="has-text-centered">0.727</td>
|
1421 |
<td class="has-text-centered">0.524</td>
|
1422 |
<td class="has-text-centered">0.499</td>
|
1423 |
-
|
1424 |
-
|
1425 |
-
<td class="has-text-centered">0.436</td>
|
1426 |
-
<td class="has-text-centered">0.541</td>
|
1427 |
-
</tr>
|
1428 |
-
<tr>
|
1429 |
<td>DeepSeek LLM (67B)</td>
|
1430 |
<td class="has-text-centered">0.118</td>
|
1431 |
<td class="has-text-centered">0.278</td>
|
1432 |
<td class="has-text-centered">0.302</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1433 |
<td class="has-text-centered">0.815</td>
|
1434 |
<td class="has-text-centered">0.867</td>
|
1435 |
<td class="has-text-centered">0.815</td>
|
1436 |
<td class="has-text-centered">0.811</td>
|
1437 |
-
|
1438 |
-
|
1439 |
-
<td class="has-text-centered">0.462</td>
|
1440 |
-
<td class="has-text-centered">0.544</td>
|
1441 |
-
</tr>
|
1442 |
-
<tr>
|
1443 |
<td>Gemma 2 27B</td>
|
1444 |
<td class="has-text-centered performance-best">0.100</td>
|
1445 |
<td class="has-text-centered performance-best">0.266</td>
|
1446 |
<td class="has-text-centered">0.406</td>
|
1447 |
-
<td class="has-text-centered">0.
|
1448 |
-
<td class="has-text-centered">0.
|
1449 |
-
<td class="has-text-centered">0.
|
1450 |
-
<td class="has-text-centered">0.
|
1451 |
<td class="has-text-centered">0.562</td>
|
1452 |
<td class="has-text-centered">0.524</td>
|
1453 |
<td class="has-text-centered">0.515</td>
|
1454 |
<td class="has-text-centered">0.524</td>
|
1455 |
-
|
1456 |
-
|
|
|
|
|
|
|
|
|
1457 |
<td>Gemma 2 9B</td>
|
1458 |
<td class="has-text-centered">0.189</td>
|
1459 |
<td class="has-text-centered">0.352</td>
|
1460 |
<td class="has-text-centered">-0.120</td>
|
1461 |
-
<td class="has-text-centered
|
1462 |
-
<td class="has-text-centered
|
1463 |
-
<td class="has-text-centered
|
1464 |
-
<td class="has-text-centered
|
1465 |
<td class="has-text-centered">0.570</td>
|
1466 |
<td class="has-text-centered">0.499</td>
|
1467 |
<td class="has-text-centered">0.491</td>
|
1468 |
<td class="has-text-centered">0.499</td>
|
1469 |
-
|
1470 |
-
|
|
|
|
|
|
|
|
|
1471 |
<td>Mistral (7B) Instruct v0.3</td>
|
1472 |
<td class="has-text-centered">0.135</td>
|
1473 |
<td class="has-text-centered">0.278</td>
|
1474 |
<td class="has-text-centered">0.200</td>
|
1475 |
-
<td class="has-text-centered">0.
|
1476 |
-
<td class="has-text-centered">0.
|
1477 |
-
<td class="has-text-centered">0.
|
1478 |
-
<td class="has-text-centered">0.
|
1479 |
<td class="has-text-centered">0.607</td>
|
1480 |
<td class="has-text-centered">0.542</td>
|
1481 |
<td class="has-text-centered">0.522</td>
|
1482 |
<td class="has-text-centered">0.542</td>
|
1483 |
-
|
1484 |
-
|
|
|
|
|
|
|
|
|
1485 |
<td>Mixtral-8x22B Instruct</td>
|
1486 |
<td class="has-text-centered">0.221</td>
|
1487 |
<td class="has-text-centered">0.364</td>
|
1488 |
-
<td class="has-text-centered">-0.310</td>
|
1489 |
-
<td class="has-text-centered">0.
|
1490 |
-
<td class="has-text-centered">0.
|
1491 |
-
<td class="has-text-centered">0.
|
1492 |
-
<td class="has-text-centered">0.
|
1493 |
<td class="has-text-centered">0.614</td>
|
1494 |
<td class="has-text-centered">0.538</td>
|
1495 |
<td class="has-text-centered">0.510</td>
|
1496 |
<td class="has-text-centered">0.538</td>
|
1497 |
-
|
1498 |
-
|
1499 |
-
<td>
|
1500 |
-
<td class="has-text-centered">0.
|
1501 |
-
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
1505 |
-
|
1506 |
-
|
1507 |
-
|
1508 |
-
|
1509 |
-
|
1510 |
-
|
1511 |
-
</
|
1512 |
-
<
|
1513 |
-
|
1514 |
-
|
1515 |
-
|
1516 |
-
|
1517 |
-
|
1518 |
-
|
1519 |
-
|
1520 |
-
|
1521 |
-
|
1522 |
-
|
1523 |
-
|
1524 |
-
|
1525 |
-
</
|
1526 |
-
<
|
1527 |
-
|
1528 |
-
|
1529 |
-
|
1530 |
-
|
1531 |
-
|
1532 |
-
|
1533 |
-
|
1534 |
-
|
1535 |
-
|
1536 |
-
|
1537 |
-
|
1538 |
-
|
1539 |
-
</
|
1540 |
-
<
|
1541 |
-
|
1542 |
-
|
1543 |
-
|
1544 |
-
|
1545 |
-
|
1546 |
-
|
1547 |
-
|
1548 |
-
|
1549 |
-
|
1550 |
-
|
1551 |
-
|
1552 |
-
|
1553 |
-
</
|
1554 |
-
<
|
1555 |
-
|
1556 |
-
|
1557 |
-
|
1558 |
-
|
1559 |
-
|
1560 |
-
|
1561 |
-
|
1562 |
-
|
1563 |
-
|
1564 |
-
|
1565 |
-
|
1566 |
-
|
1567 |
-
</
|
1568 |
-
<
|
1569 |
-
|
1570 |
-
|
1571 |
-
|
1572 |
-
|
1573 |
-
|
1574 |
-
|
1575 |
-
|
1576 |
-
|
1577 |
-
|
1578 |
-
|
1579 |
-
|
1580 |
-
|
1581 |
-
</
|
1582 |
-
<
|
1583 |
-
|
1584 |
-
|
1585 |
-
|
1586 |
-
|
1587 |
-
|
1588 |
-
|
1589 |
-
|
1590 |
-
|
1591 |
-
|
1592 |
-
|
1593 |
-
|
1594 |
-
|
1595 |
-
</
|
1596 |
-
<
|
1597 |
-
|
1598 |
-
|
1599 |
-
|
1600 |
-
|
1601 |
-
|
1602 |
-
|
1603 |
-
|
1604 |
-
|
1605 |
-
|
1606 |
-
|
1607 |
-
|
1608 |
-
|
1609 |
-
|
1610 |
-
|
1611 |
-
|
1612 |
-
|
1613 |
-
|
1614 |
-
|
1615 |
-
|
1616 |
-
|
1617 |
-
|
1618 |
-
|
1619 |
-
|
1620 |
-
|
1621 |
-
|
1622 |
-
|
1623 |
-
|
1624 |
-
|
1625 |
-
|
1626 |
-
|
1627 |
-
|
1628 |
-
|
1629 |
-
|
1630 |
-
|
1631 |
-
|
1632 |
-
|
1633 |
-
|
1634 |
-
|
1635 |
-
|
1636 |
-
|
1637 |
-
|
1638 |
-
|
1639 |
-
|
1640 |
-
|
1641 |
-
|
1642 |
-
|
1643 |
-
|
1644 |
-
|
1645 |
-
|
1646 |
-
|
1647 |
-
|
1648 |
-
|
1649 |
-
|
1650 |
-
|
1651 |
-
|
1652 |
-
|
1653 |
-
|
1654 |
-
|
1655 |
-
|
1656 |
-
|
1657 |
-
|
1658 |
-
|
1659 |
-
|
1660 |
-
|
1661 |
-
|
1662 |
-
|
1663 |
-
|
1664 |
-
|
1665 |
-
|
1666 |
-
|
1667 |
-
|
1668 |
-
|
1669 |
-
|
1670 |
-
|
1671 |
-
|
1672 |
-
|
1673 |
-
|
1674 |
-
|
1675 |
-
|
1676 |
-
|
1677 |
-
|
1678 |
-
|
1679 |
-
|
1680 |
-
|
1681 |
-
|
1682 |
-
|
1683 |
-
|
1684 |
-
|
1685 |
-
|
1686 |
-
|
1687 |
-
|
1688 |
-
|
1689 |
-
|
1690 |
-
|
1691 |
-
|
1692 |
-
|
1693 |
-
|
1694 |
-
|
1695 |
-
|
1696 |
-
|
1697 |
-
|
1698 |
-
|
1699 |
-
|
1700 |
-
|
1701 |
-
|
1702 |
-
|
1703 |
-
|
1704 |
-
|
1705 |
-
|
1706 |
-
|
1707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1708 |
</tbody>
|
1709 |
</table>
|
1710 |
<div class="content is-small mt-4">
|
@@ -1744,9 +1842,9 @@
|
|
1744 |
<th class="has-text-centered">Precision</th>
|
1745 |
<th class="has-text-centered">Recall</th>
|
1746 |
<th class="has-text-centered">F1</th>
|
1747 |
-
<th class="has-text-centered">Accuracy</th>
|
1748 |
<th class="has-text-centered">Precision</th>
|
1749 |
<th class="has-text-centered">Recall</th>
|
|
|
1750 |
<th class="has-text-centered">F1</th>
|
1751 |
<th class="has-text-centered">Accuracy</th>
|
1752 |
</tr>
|
|
|
648 |
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FinEntity</th>
|
649 |
</tr>
|
650 |
<tr>
|
|
|
651 |
<th class="has-text-centered">Precision</th>
|
652 |
<th class="has-text-centered">Recall</th>
|
653 |
<th class="has-text-centered">F1</th>
|
654 |
<th class="has-text-centered">Accuracy</th>
|
655 |
+
<th class="has-text-centered">Accuracy</th>
|
656 |
<th class="has-text-centered">Precision</th>
|
657 |
<th class="has-text-centered">Recall</th>
|
658 |
<th class="has-text-centered">F1</th>
|
|
|
660 |
<th class="has-text-centered">Precision</th>
|
661 |
<th class="has-text-centered">Recall</th>
|
662 |
<th class="has-text-centered">F1</th>
|
|
|
663 |
<th class="has-text-centered">Precision</th>
|
664 |
<th class="has-text-centered">Recall</th>
|
665 |
<th class="has-text-centered">F1</th>
|
666 |
<th class="has-text-centered">Accuracy</th>
|
667 |
<th class="has-text-centered">Precision</th>
|
668 |
<th class="has-text-centered">Recall</th>
|
669 |
+
<th class="has-text-centered">Accuracy</th>
|
670 |
<th class="has-text-centered">F1</th>
|
671 |
</tr>
|
672 |
</thead>
|
|
|
1365 |
<tr>
|
1366 |
<th rowspan="2">Model</th>
|
1367 |
<th colspan="3" class="has-text-centered tooltip-trigger tooltip-right" data-title="FiQA Task 1" data-tooltip="FiQA Task 1 focuses on aspect-based financial sentiment analysis in microblog posts and news headlines using a continuous scale from -1 (negative) to 1 (positive). The regression task requires models to accurately predict the sentiment score that reflects investor perception of financial texts.">FiQA Task 1</th>
|
1368 |
+
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="FinEntity" data-tooltip="FinEntity consists of 979 financial news paragraphs containing 2,131 manually-annotated financial entities with sentiment classifications. The task involves identifying companies and asset classes in financial texts while determining the associated sentiment expressed toward each entity.">FinEntity</th>
|
1369 |
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="SubjECTive-QA" data-tooltip="SubjECTive-QA contains 49,446 annotations across 2,747 question-answer pairs extracted from 120 earnings call transcripts. The multi-label classification task involves analyzing six subjective features in financial discourse: assertiveness, cautiousness, optimism, specificity, clarity, and relevance.">SubjECTive-QA</th>
|
1370 |
+
<th colspan="4" class="has-text-centered tooltip-trigger tooltip-right" data-title="Financial Phrase Bank" data-tooltip="Financial Phrase Bank (FPB) contains 4,840 sentences from financial news articles categorized as positive, negative, or neutral by 16 finance experts using majority voting. The sentiment classification task requires understanding how these statements might influence investor perception of stock prices.">Financial Phrase Bank (FPB)</th>
|
1371 |
</tr>
|
1372 |
<tr>
|
1373 |
<th class="has-text-centered">MSE</th>
|
1374 |
<th class="has-text-centered">MAE</th>
|
1375 |
<th class="has-text-centered">r² Score</th>
|
|
|
1376 |
<th class="has-text-centered">Precision</th>
|
1377 |
<th class="has-text-centered">Recall</th>
|
1378 |
+
<th class="has-text-centered">Accuracy</th>
|
1379 |
<th class="has-text-centered">F1</th>
|
1380 |
<th class="has-text-centered">Precision</th>
|
1381 |
<th class="has-text-centered">Recall</th>
|
1382 |
<th class="has-text-centered">F1</th>
|
1383 |
<th class="has-text-centered">Accuracy</th>
|
1384 |
+
<th class="has-text-centered">Accuracy</th>
|
1385 |
+
<th class="has-text-centered">Precision</th>
|
1386 |
+
<th class="has-text-centered">Recall</th>
|
1387 |
+
<th class="has-text-centered">F1</th>
|
1388 |
</tr>
|
1389 |
</thead>
|
1390 |
<tbody>
|
|
|
1393 |
<td class="has-text-centered">0.123</td>
|
1394 |
<td class="has-text-centered">0.290</td>
|
1395 |
<td class="has-text-centered">0.272</td>
|
1396 |
+
<td class="has-text-centered">0.474</td>
|
1397 |
+
<td class="has-text-centered">0.485</td>
|
1398 |
+
<td class="has-text-centered">0.485</td>
|
1399 |
+
<td class="has-text-centered">0.469</td>
|
1400 |
<td class="has-text-centered">0.652</td>
|
1401 |
<td class="has-text-centered">0.573</td>
|
1402 |
<td class="has-text-centered">0.535</td>
|
1403 |
<td class="has-text-centered">0.573</td>
|
1404 |
+
<td class="has-text-centered">0.901</td>
|
1405 |
+
<td class="has-text-centered">0.904</td>
|
1406 |
+
<td class="has-text-centered">0.901</td>
|
1407 |
+
<td class="has-text-centered">0.902</td>
|
1408 |
+
</tr>
|
1409 |
+
<tr>
|
1410 |
<td>Llama 3 8B Instruct</td>
|
1411 |
<td class="has-text-centered">0.161</td>
|
1412 |
<td class="has-text-centered">0.344</td>
|
1413 |
<td class="has-text-centered">0.045</td>
|
1414 |
+
<td class="has-text-centered">0.301</td>
|
1415 |
+
<td class="has-text-centered">0.478</td>
|
1416 |
+
<td class="has-text-centered">0.478</td>
|
1417 |
+
<td class="has-text-centered">0.350</td>
|
1418 |
+
<td class="has-text-centered">0.635</td>
|
1419 |
+
<td class="has-text-centered performance-best">0.625</td>
|
1420 |
+
<td class="has-text-centered performance-best">0.600</td>
|
1421 |
+
<td class="has-text-centered performance-best">0.625</td>
|
1422 |
<td class="has-text-centered">0.738</td>
|
1423 |
<td class="has-text-centered">0.801</td>
|
1424 |
<td class="has-text-centered">0.738</td>
|
1425 |
<td class="has-text-centered">0.698</td>
|
1426 |
+
</tr>
|
1427 |
+
<tr>
|
|
|
|
|
|
|
|
|
1428 |
<td>DBRX Instruct</td>
|
1429 |
<td class="has-text-centered">0.160</td>
|
1430 |
<td class="has-text-centered">0.321</td>
|
1431 |
<td class="has-text-centered">0.052</td>
|
1432 |
+
<td class="has-text-centered">0.004</td>
|
1433 |
+
<td class="has-text-centered">0.014</td>
|
1434 |
+
<td class="has-text-centered">0.014</td>
|
1435 |
+
<td class="has-text-centered">0.006</td>
|
1436 |
+
<td class="has-text-centered performance-low">0.654</td>
|
1437 |
+
<td class="has-text-centered">0.541</td>
|
1438 |
+
<td class="has-text-centered">0.436</td>
|
1439 |
+
<td class="has-text-centered">0.541</td>
|
1440 |
<td class="has-text-centered">0.524</td>
|
1441 |
<td class="has-text-centered">0.727</td>
|
1442 |
<td class="has-text-centered">0.524</td>
|
1443 |
<td class="has-text-centered">0.499</td>
|
1444 |
+
</tr>
|
1445 |
+
<tr>
|
|
|
|
|
|
|
|
|
1446 |
<td>DeepSeek LLM (67B)</td>
|
1447 |
<td class="has-text-centered">0.118</td>
|
1448 |
<td class="has-text-centered">0.278</td>
|
1449 |
<td class="has-text-centered">0.302</td>
|
1450 |
+
<td class="has-text-centered">0.456</td>
|
1451 |
+
<td class="has-text-centered">0.405</td>
|
1452 |
+
<td class="has-text-centered">0.405</td>
|
1453 |
+
<td class="has-text-centered">0.416</td>
|
1454 |
+
<td class="has-text-centered performance-best">0.676</td>
|
1455 |
+
<td class="has-text-centered">0.544</td>
|
1456 |
+
<td class="has-text-centered">0.462</td>
|
1457 |
+
<td class="has-text-centered">0.544</td>
|
1458 |
<td class="has-text-centered">0.815</td>
|
1459 |
<td class="has-text-centered">0.867</td>
|
1460 |
<td class="has-text-centered">0.815</td>
|
1461 |
<td class="has-text-centered">0.811</td>
|
1462 |
+
</tr>
|
1463 |
+
<tr>
|
|
|
|
|
|
|
|
|
1464 |
<td>Gemma 2 27B</td>
|
1465 |
<td class="has-text-centered performance-best">0.100</td>
|
1466 |
<td class="has-text-centered performance-best">0.266</td>
|
1467 |
<td class="has-text-centered">0.406</td>
|
1468 |
+
<td class="has-text-centered">0.320</td>
|
1469 |
+
<td class="has-text-centered">0.295</td>
|
1470 |
+
<td class="has-text-centered">0.295</td>
|
1471 |
+
<td class="has-text-centered">0.298</td>
|
1472 |
<td class="has-text-centered">0.562</td>
|
1473 |
<td class="has-text-centered">0.524</td>
|
1474 |
<td class="has-text-centered">0.515</td>
|
1475 |
<td class="has-text-centered">0.524</td>
|
1476 |
+
<td class="has-text-centered">0.890</td>
|
1477 |
+
<td class="has-text-centered">0.896</td>
|
1478 |
+
<td class="has-text-centered">0.890</td>
|
1479 |
+
<td class="has-text-centered">0.884</td>
|
1480 |
+
</tr>
|
1481 |
+
<tr>
|
1482 |
<td>Gemma 2 9B</td>
|
1483 |
<td class="has-text-centered">0.189</td>
|
1484 |
<td class="has-text-centered">0.352</td>
|
1485 |
<td class="has-text-centered">-0.120</td>
|
1486 |
+
<td class="has-text-centered">0.348</td>
|
1487 |
+
<td class="has-text-centered">0.419</td>
|
1488 |
+
<td class="has-text-centered">0.419</td>
|
1489 |
+
<td class="has-text-centered">0.367</td>
|
1490 |
<td class="has-text-centered">0.570</td>
|
1491 |
<td class="has-text-centered">0.499</td>
|
1492 |
<td class="has-text-centered">0.491</td>
|
1493 |
<td class="has-text-centered">0.499</td>
|
1494 |
+
<td class="has-text-centered performance-medium">0.940</td>
|
1495 |
+
<td class="has-text-centered performance-medium">0.941</td>
|
1496 |
+
<td class="has-text-centered performance-medium">0.940</td>
|
1497 |
+
<td class="has-text-centered performance-medium">0.940</td>
|
1498 |
+
</tr>
|
1499 |
+
<tr>
|
1500 |
<td>Mistral (7B) Instruct v0.3</td>
|
1501 |
<td class="has-text-centered">0.135</td>
|
1502 |
<td class="has-text-centered">0.278</td>
|
1503 |
<td class="has-text-centered">0.200</td>
|
1504 |
+
<td class="has-text-centered">0.337</td>
|
1505 |
+
<td class="has-text-centered">0.477</td>
|
1506 |
+
<td class="has-text-centered">0.477</td>
|
1507 |
+
<td class="has-text-centered">0.368</td>
|
1508 |
<td class="has-text-centered">0.607</td>
|
1509 |
<td class="has-text-centered">0.542</td>
|
1510 |
<td class="has-text-centered">0.522</td>
|
1511 |
<td class="has-text-centered">0.542</td>
|
1512 |
+
<td class="has-text-centered">0.847</td>
|
1513 |
+
<td class="has-text-centered">0.854</td>
|
1514 |
+
<td class="has-text-centered">0.847</td>
|
1515 |
+
<td class="has-text-centered">0.841</td>
|
1516 |
+
</tr>
|
1517 |
+
<tr>
|
1518 |
<td>Mixtral-8x22B Instruct</td>
|
1519 |
<td class="has-text-centered">0.221</td>
|
1520 |
<td class="has-text-centered">0.364</td>
|
1521 |
+
<td class="has-text-centered">-0.310</td>
|
1522 |
+
<td class="has-text-centered">0.428</td>
|
1523 |
+
<td class="has-text-centered">0.481</td>
|
1524 |
+
<td class="has-text-centered">0.481</td>
|
1525 |
+
<td class="has-text-centered">0.435</td>
|
1526 |
<td class="has-text-centered">0.614</td>
|
1527 |
<td class="has-text-centered">0.538</td>
|
1528 |
<td class="has-text-centered">0.510</td>
|
1529 |
<td class="has-text-centered">0.538</td>
|
1530 |
+
<td class="has-text-centered">0.768</td>
|
1531 |
+
<td class="has-text-centered">0.845</td>
|
1532 |
+
<td class="has-text-centered">0.768</td>
|
1533 |
+
<td class="has-text-centered">0.776</td>
|
1534 |
+
</tr>
|
1535 |
+
<tr>
|
1536 |
+
<td>Mixtral-8x7B Instruct</td>
|
1537 |
+
<td class="has-text-centered">0.208</td>
|
1538 |
+
<td class="has-text-centered">0.307</td>
|
1539 |
+
<td class="has-text-centered">-0.229</td>
|
1540 |
+
<td class="has-text-centered">0.251</td>
|
1541 |
+
<td class="has-text-centered">0.324</td>
|
1542 |
+
<td class="has-text-centered">0.324</td>
|
1543 |
+
<td class="has-text-centered">0.267</td>
|
1544 |
+
<td class="has-text-centered">0.611</td>
|
1545 |
+
<td class="has-text-centered">0.518</td>
|
1546 |
+
<td class="has-text-centered">0.498</td>
|
1547 |
+
<td class="has-text-centered">0.518</td>
|
1548 |
+
<td class="has-text-centered">0.896</td>
|
1549 |
+
<td class="has-text-centered">0.898</td>
|
1550 |
+
<td class="has-text-centered">0.896</td>
|
1551 |
+
<td class="has-text-centered">0.893</td>
|
1552 |
+
</tr>
|
1553 |
+
<tr>
|
1554 |
+
<td>Qwen 2 Instruct (72B)</td>
|
1555 |
+
<td class="has-text-centered">0.205</td>
|
1556 |
+
<td class="has-text-centered">0.409</td>
|
1557 |
+
<td class="has-text-centered">-0.212</td>
|
1558 |
+
<td class="has-text-centered">0.468</td>
|
1559 |
+
<td class="has-text-centered">0.530</td>
|
1560 |
+
<td class="has-text-centered">0.530</td>
|
1561 |
+
<td class="has-text-centered">0.483</td>
|
1562 |
+
<td class="has-text-centered">0.644</td>
|
1563 |
+
<td class="has-text-centered performance-medium">0.601</td>
|
1564 |
+
<td class="has-text-centered">0.576</td>
|
1565 |
+
<td class="has-text-centered performance-medium">0.601</td>
|
1566 |
+
<td class="has-text-centered">0.904</td>
|
1567 |
+
<td class="has-text-centered">0.908</td>
|
1568 |
+
<td class="has-text-centered">0.904</td>
|
1569 |
+
<td class="has-text-centered">0.901</td>
|
1570 |
+
</tr>
|
1571 |
+
<tr>
|
1572 |
+
<td>WizardLM-2 8x22B</td>
|
1573 |
+
<td class="has-text-centered">0.129</td>
|
1574 |
+
<td class="has-text-centered">0.283</td>
|
1575 |
+
<td class="has-text-centered">0.239</td>
|
1576 |
+
<td class="has-text-centered">0.222</td>
|
1577 |
+
<td class="has-text-centered">0.247</td>
|
1578 |
+
<td class="has-text-centered">0.247</td>
|
1579 |
+
<td class="has-text-centered">0.226</td>
|
1580 |
+
<td class="has-text-centered">0.611</td>
|
1581 |
+
<td class="has-text-centered">0.570</td>
|
1582 |
+
<td class="has-text-centered">0.566</td>
|
1583 |
+
<td class="has-text-centered">0.570</td>
|
1584 |
+
<td class="has-text-centered">0.765</td>
|
1585 |
+
<td class="has-text-centered">0.853</td>
|
1586 |
+
<td class="has-text-centered">0.765</td>
|
1587 |
+
<td class="has-text-centered">0.779</td>
|
1588 |
+
</tr>
|
1589 |
+
<tr>
|
1590 |
+
<td>DeepSeek-V3</td>
|
1591 |
+
<td class="has-text-centered">0.150</td>
|
1592 |
+
<td class="has-text-centered">0.311</td>
|
1593 |
+
<td class="has-text-centered">0.111</td>
|
1594 |
+
<td class="has-text-centered">0.563</td>
|
1595 |
+
<td class="has-text-centered">0.544</td>
|
1596 |
+
<td class="has-text-centered">0.544</td>
|
1597 |
+
<td class="has-text-centered">0.549</td>
|
1598 |
+
<td class="has-text-centered">0.640</td>
|
1599 |
+
<td class="has-text-centered">0.572</td>
|
1600 |
+
<td class="has-text-centered performance-low">0.583</td>
|
1601 |
+
<td class="has-text-centered">0.572</td>
|
1602 |
+
<td class="has-text-centered">0.828</td>
|
1603 |
+
<td class="has-text-centered">0.851</td>
|
1604 |
+
<td class="has-text-centered">0.828</td>
|
1605 |
+
<td class="has-text-centered">0.814</td>
|
1606 |
+
</tr>
|
1607 |
+
<tr>
|
1608 |
+
<td>DeepSeek R1</td>
|
1609 |
+
<td class="has-text-centered">0.110</td>
|
1610 |
+
<td class="has-text-centered">0.289</td>
|
1611 |
+
<td class="has-text-centered">0.348</td>
|
1612 |
+
<td class="has-text-centered performance-low">0.600</td>
|
1613 |
+
<td class="has-text-centered performance-low">0.586</td>
|
1614 |
+
<td class="has-text-centered performance-low">0.586</td>
|
1615 |
+
<td class="has-text-centered performance-low">0.587</td>
|
1616 |
+
<td class="has-text-centered">0.644</td>
|
1617 |
+
<td class="has-text-centered">0.489</td>
|
1618 |
+
<td class="has-text-centered">0.499</td>
|
1619 |
+
<td class="has-text-centered">0.489</td>
|
1620 |
+
<td class="has-text-centered">0.904</td>
|
1621 |
+
<td class="has-text-centered">0.907</td>
|
1622 |
+
<td class="has-text-centered">0.904</td>
|
1623 |
+
<td class="has-text-centered">0.902</td>
|
1624 |
+
</tr>
|
1625 |
+
<tr>
|
1626 |
+
<td>QwQ-32B-Preview</td>
|
1627 |
+
<td class="has-text-centered">0.141</td>
|
1628 |
+
<td class="has-text-centered">0.290</td>
|
1629 |
+
<td class="has-text-centered">0.165</td>
|
1630 |
+
<td class="has-text-centered">0.005</td>
|
1631 |
+
<td class="has-text-centered">0.005</td>
|
1632 |
+
<td class="has-text-centered">0.005</td>
|
1633 |
+
<td class="has-text-centered">0.005</td>
|
1634 |
+
<td class="has-text-centered">0.629</td>
|
1635 |
+
<td class="has-text-centered">0.534</td>
|
1636 |
+
<td class="has-text-centered">0.550</td>
|
1637 |
+
<td class="has-text-centered">0.534</td>
|
1638 |
+
<td class="has-text-centered">0.812</td>
|
1639 |
+
<td class="has-text-centered">0.827</td>
|
1640 |
+
<td class="has-text-centered">0.812</td>
|
1641 |
+
<td class="has-text-centered">0.815</td>
|
1642 |
+
</tr>
|
1643 |
+
<tr>
|
1644 |
+
<td>Jamba 1.5 Mini</td>
|
1645 |
+
<td class="has-text-centered">0.119</td>
|
1646 |
+
<td class="has-text-centered">0.282</td>
|
1647 |
+
<td class="has-text-centered">0.293</td>
|
1648 |
+
<td class="has-text-centered">0.119</td>
|
1649 |
+
<td class="has-text-centered">0.182</td>
|
1650 |
+
<td class="has-text-centered">0.182</td>
|
1651 |
+
<td class="has-text-centered">0.132</td>
|
1652 |
+
<td class="has-text-centered">0.380</td>
|
1653 |
+
<td class="has-text-centered">0.525</td>
|
1654 |
+
<td class="has-text-centered">0.418</td>
|
1655 |
+
<td class="has-text-centered">0.525</td>
|
1656 |
+
<td class="has-text-centered">0.784</td>
|
1657 |
+
<td class="has-text-centered">0.814</td>
|
1658 |
+
<td class="has-text-centered">0.784</td>
|
1659 |
+
<td class="has-text-centered">0.765</td>
|
1660 |
+
</tr>
|
1661 |
+
<tr>
|
1662 |
+
<td>Jamba 1.5 Large</td>
|
1663 |
+
<td class="has-text-centered">0.183</td>
|
1664 |
+
<td class="has-text-centered">0.363</td>
|
1665 |
+
<td class="has-text-centered">-0.085</td>
|
1666 |
+
<td class="has-text-centered">0.403</td>
|
1667 |
+
<td class="has-text-centered">0.414</td>
|
1668 |
+
<td class="has-text-centered">0.414</td>
|
1669 |
+
<td class="has-text-centered">0.397</td>
|
1670 |
+
<td class="has-text-centered">0.635</td>
|
1671 |
+
<td class="has-text-centered">0.573</td>
|
1672 |
+
<td class="has-text-centered">0.582</td>
|
1673 |
+
<td class="has-text-centered">0.573</td>
|
1674 |
+
<td class="has-text-centered">0.824</td>
|
1675 |
+
<td class="has-text-centered">0.850</td>
|
1676 |
+
<td class="has-text-centered">0.824</td>
|
1677 |
+
<td class="has-text-centered">0.798</td>
|
1678 |
+
</tr>
|
1679 |
+
<tr>
|
1680 |
+
<td>Claude 3.5 Sonnet</td>
|
1681 |
+
<td class="has-text-centered performance-medium">0.101</td>
|
1682 |
+
<td class="has-text-centered performance-medium">0.268</td>
|
1683 |
+
<td class="has-text-centered">0.402</td>
|
1684 |
+
<td class="has-text-centered performance-medium">0.658</td>
|
1685 |
+
<td class="has-text-centered performance-medium">0.668</td>
|
1686 |
+
<td class="has-text-centered performance-medium">0.668</td>
|
1687 |
+
<td class="has-text-centered performance-medium">0.655</td>
|
1688 |
+
<td class="has-text-centered">0.634</td>
|
1689 |
+
<td class="has-text-centered">0.585</td>
|
1690 |
+
<td class="has-text-centered">0.553</td>
|
1691 |
+
<td class="has-text-centered">0.585</td>
|
1692 |
+
<td class="has-text-centered performance-best">0.944</td>
|
1693 |
+
<td class="has-text-centered performance-best">0.945</td>
|
1694 |
+
<td class="has-text-centered performance-best">0.944</td>
|
1695 |
+
<td class="has-text-centered performance-best">0.944</td>
|
1696 |
+
</tr>
|
1697 |
+
<tr>
|
1698 |
+
<td>Claude 3 Haiku</td>
|
1699 |
+
<td class="has-text-centered">0.167</td>
|
1700 |
+
<td class="has-text-centered">0.349</td>
|
1701 |
+
<td class="has-text-centered">0.008</td>
|
1702 |
+
<td class="has-text-centered">0.498</td>
|
1703 |
+
<td class="has-text-centered">0.517</td>
|
1704 |
+
<td class="has-text-centered">0.517</td>
|
1705 |
+
<td class="has-text-centered">0.494</td>
|
1706 |
+
<td class="has-text-centered">0.619</td>
|
1707 |
+
<td class="has-text-centered">0.538</td>
|
1708 |
+
<td class="has-text-centered">0.463</td>
|
1709 |
+
<td class="has-text-centered">0.538</td>
|
1710 |
+
<td class="has-text-centered">0.907</td>
|
1711 |
+
<td class="has-text-centered">0.913</td>
|
1712 |
+
<td class="has-text-centered">0.907</td>
|
1713 |
+
<td class="has-text-centered">0.908</td>
|
1714 |
+
</tr>
|
1715 |
+
<tr>
|
1716 |
+
<td>Cohere Command R 7B</td>
|
1717 |
+
<td class="has-text-centered">0.164</td>
|
1718 |
+
<td class="has-text-centered">0.319</td>
|
1719 |
+
<td class="has-text-centered">0.028</td>
|
1720 |
+
<td class="has-text-centered">0.457</td>
|
1721 |
+
<td class="has-text-centered">0.446</td>
|
1722 |
+
<td class="has-text-centered">0.446</td>
|
1723 |
+
<td class="has-text-centered">0.441</td>
|
1724 |
+
<td class="has-text-centered">0.609</td>
|
1725 |
+
<td class="has-text-centered">0.547</td>
|
1726 |
+
<td class="has-text-centered">0.532</td>
|
1727 |
+
<td class="has-text-centered">0.547</td>
|
1728 |
+
<td class="has-text-centered">0.835</td>
|
1729 |
+
<td class="has-text-centered">0.861</td>
|
1730 |
+
<td class="has-text-centered">0.835</td>
|
1731 |
+
<td class="has-text-centered">0.840</td>
|
1732 |
+
</tr>
|
1733 |
+
<tr>
|
1734 |
+
<td>Cohere Command R +</td>
|
1735 |
+
<td class="has-text-centered performance-low">0.106</td>
|
1736 |
+
<td class="has-text-centered performance-low">0.274</td>
|
1737 |
+
<td class="has-text-centered">0.373</td>
|
1738 |
+
<td class="has-text-centered">0.462</td>
|
1739 |
+
<td class="has-text-centered">0.459</td>
|
1740 |
+
<td class="has-text-centered">0.459</td>
|
1741 |
+
<td class="has-text-centered">0.452</td>
|
1742 |
+
<td class="has-text-centered">0.608</td>
|
1743 |
+
<td class="has-text-centered">0.547</td>
|
1744 |
+
<td class="has-text-centered">0.533</td>
|
1745 |
+
<td class="has-text-centered">0.547</td>
|
1746 |
+
<td class="has-text-centered">0.741</td>
|
1747 |
+
<td class="has-text-centered">0.806</td>
|
1748 |
+
<td class="has-text-centered">0.741</td>
|
1749 |
+
<td class="has-text-centered">0.699</td>
|
1750 |
+
</tr>
|
1751 |
+
<tr>
|
1752 |
+
<td>Google Gemini 1.5 Pro</td>
|
1753 |
+
<td class="has-text-centered">0.144</td>
|
1754 |
+
<td class="has-text-centered">0.329</td>
|
1755 |
+
<td class="has-text-centered">0.149</td>
|
1756 |
+
<td class="has-text-centered">0.399</td>
|
1757 |
+
<td class="has-text-centered">0.400</td>
|
1758 |
+
<td class="has-text-centered">0.400</td>
|
1759 |
+
<td class="has-text-centered">0.393</td>
|
1760 |
+
<td class="has-text-centered">0.642</td>
|
1761 |
+
<td class="has-text-centered performance-low">0.587</td>
|
1762 |
+
<td class="has-text-centered performance-medium">0.593</td>
|
1763 |
+
<td class="has-text-centered performance-low">0.587</td>
|
1764 |
+
<td class="has-text-centered">0.890</td>
|
1765 |
+
<td class="has-text-centered">0.895</td>
|
1766 |
+
<td class="has-text-centered">0.890</td>
|
1767 |
+
<td class="has-text-centered">0.885</td>
|
1768 |
+
</tr>
|
1769 |
+
<tr>
|
1770 |
+
<td>OpenAI gpt-4o</td>
|
1771 |
+
<td class="has-text-centered">0.184</td>
|
1772 |
+
<td class="has-text-centered">0.317</td>
|
1773 |
+
<td class="has-text-centered">-0.089</td>
|
1774 |
+
<td class="has-text-centered">0.537</td>
|
1775 |
+
<td class="has-text-centered">0.517</td>
|
1776 |
+
<td class="has-text-centered">0.517</td>
|
1777 |
+
<td class="has-text-centered">0.523</td>
|
1778 |
+
<td class="has-text-centered">0.639</td>
|
1779 |
+
<td class="has-text-centered">0.515</td>
|
1780 |
+
<td class="has-text-centered">0.541</td>
|
1781 |
+
<td class="has-text-centered">0.515</td>
|
1782 |
+
<td class="has-text-centered performance-low">0.929</td>
|
1783 |
+
<td class="has-text-centered performance-low">0.931</td>
|
1784 |
+
<td class="has-text-centered performance-low">0.929</td>
|
1785 |
+
<td class="has-text-centered performance-low">0.928</td>
|
1786 |
+
</tr>
|
1787 |
+
<tr>
|
1788 |
+
<td>OpenAI o1-mini</td>
|
1789 |
+
<td class="has-text-centered">0.120</td>
|
1790 |
+
<td class="has-text-centered">0.295</td>
|
1791 |
+
<td class="has-text-centered">0.289</td>
|
1792 |
+
<td class="has-text-centered performance-best">0.661</td>
|
1793 |
+
<td class="has-text-centered performance-best">0.681</td>
|
1794 |
+
<td class="has-text-centered performance-best">0.681</td>
|
1795 |
+
<td class="has-text-centered performance-best">0.662</td>
|
1796 |
+
<td class="has-text-centered performance-medium">0.660</td>
|
1797 |
+
<td class="has-text-centered">0.515</td>
|
1798 |
+
<td class="has-text-centered">0.542</td>
|
1799 |
+
<td class="has-text-centered">0.515</td>
|
1800 |
+
<td class="has-text-centered">0.918</td>
|
1801 |
+
<td class="has-text-centered">0.917</td>
|
1802 |
+
<td class="has-text-centered">0.918</td>
|
1803 |
+
<td class="has-text-centered">0.917</td>
|
1804 |
+
</tr>
|
1805 |
+
|
1806 |
</tbody>
|
1807 |
</table>
|
1808 |
<div class="content is-small mt-4">
|
|
|
1842 |
<th class="has-text-centered">Precision</th>
|
1843 |
<th class="has-text-centered">Recall</th>
|
1844 |
<th class="has-text-centered">F1</th>
|
|
|
1845 |
<th class="has-text-centered">Precision</th>
|
1846 |
<th class="has-text-centered">Recall</th>
|
1847 |
+
<th class="has-text-centered">Accuracy</th>
|
1848 |
<th class="has-text-centered">F1</th>
|
1849 |
<th class="has-text-centered">Accuracy</th>
|
1850 |
</tr>
|