diff --git "a/assets/results/aragen_v2_results.json" "b/assets/results/aragen_v2_results.json" --- "a/assets/results/aragen_v2_results.json" +++ "b/assets/results/aragen_v2_results.json" @@ -2,24 +2,504 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4882, - "Completeness": 0.4755, - "Conciseness": 0.1973, - "Helpfulness": 0.4659, - "Honesty": 0.4711, - "Harmlessness": 0.4875, - "3C3H Score": 0.4309 + "Correctness": 0.3735, + "Completeness": 0.3539, + "Conciseness": 0.1699, + "Helpfulness": 0.3554, + "Honesty": 0.3625, + "Harmlessness": 0.3735, + "3C3H Score": 0.3315 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1528, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7521, + "Reasoning": 0.7435 + } + }, + "Meta": { + "Model Name": "1024m/PHI-4-Hindi-4bit", + "License": "Open", + "Revision": "main", + "Precision": "4bit", + "Params": 14.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3147, + "Completeness": 0.2529, + "Conciseness": 0.2027, + "Helpfulness": 0.2713, + "Honesty": 0.2988, + "Harmlessness": 0.3088, + "3C3H Score": 0.2749 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1996, + "Orthographic and Grammatical Analysis": 0.0056, + "Safety": 0.7625, + "Reasoning": 0.3268 + } + }, + "Meta": { + "Model Name": "ALLaM-AI/ALLaM-7B-Instruct-preview", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2451, + "Completeness": 0.2059, + "Conciseness": 0.1282, + "Helpfulness": 0.2088, + "Honesty": 0.2375, + "Harmlessness": 0.2436, + "3C3H Score": 0.2115 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1927, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4146, + "Reasoning": 0.2399 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-23-35B", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 35.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1765, + "Completeness": 0.1461, + "Conciseness": 0.0929, + "Helpfulness": 0.1502, + "Honesty": 0.1725, + "Harmlessness": 0.1757, + "3C3H Score": 0.1523 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1296, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4844, + "Reasoning": 0.0929 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-23-8B", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3795, + "Completeness": 0.3618, + "Conciseness": 0.1401, + "Helpfulness": 0.3545, + "Honesty": 0.3582, + "Harmlessness": 0.3744, + "3C3H Score": 0.3281 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2394, + "Orthographic and Grammatical Analysis": 0.0556, + "Safety": 0.6823, + "Reasoning": 0.4946 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-expanse-32b", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3029, + "Completeness": 0.2882, + "Conciseness": 0.1022, + "Helpfulness": 0.2841, + "Honesty": 0.2902, + "Harmlessness": 0.3015, + "3C3H Score": 0.2615 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.174, + "Orthographic and Grammatical Analysis": 0.0319, + "Safety": 0.6531, + "Reasoning": 0.3863 + } + }, + "Meta": { + "Model Name": "CohereForAI/aya-expanse-8b", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.5412, + "Completeness": 0.5275, + "Conciseness": 0.2047, + "Helpfulness": 0.5284, + "Honesty": 0.5287, + "Harmlessness": 0.5397, + "3C3H Score": 0.4783 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3701, + "Orthographic and Grammatical Analysis": 0.1444, + "Safety": 0.7604, + "Reasoning": 0.7696 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-a-03-2025", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 111.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3235, + "Completeness": 0.2742, + "Conciseness": 0.162, + "Helpfulness": 0.2818, + "Honesty": 0.3119, + "Harmlessness": 0.3235, + "3C3H Score": 0.2795 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2439, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.4042, + "Reasoning": 0.4143 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-08-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 338, + "Failed Entries": 2, + "Success Ratio": 0.9941 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3529, + "Completeness": 0.3137, + "Conciseness": 0.1652, + "Helpfulness": 0.3069, + "Honesty": 0.3363, + "Harmlessness": 0.3485, + "3C3H Score": 0.3039 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2773, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.3646, + "Reasoning": 0.4756 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-plus-08-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 104.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3667, + "Completeness": 0.302, + "Conciseness": 0.1968, + "Helpfulness": 0.3132, + "Honesty": 0.3559, + "Harmlessness": 0.3667, + "3C3H Score": 0.3169 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2866, + "Orthographic and Grammatical Analysis": 0.0639, + "Safety": 0.6469, + "Reasoning": 0.3232 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-plus", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 104.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2517, + "Completeness": 0.2104, + "Conciseness": 0.115, + "Helpfulness": 0.2099, + "Honesty": 0.237, + "Harmlessness": 0.2495, + "3C3H Score": 0.2123 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2255, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.2937, + "Reasoning": 0.2048 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r-v01", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "float16", + "Params": 35.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2363, + "Completeness": 0.2255, + "Conciseness": 0.1157, + "Helpfulness": 0.2238, + "Honesty": 0.2299, + "Harmlessness": 0.2363, + "3C3H Score": 0.2112 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1266, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.4261, + "Reasoning": 0.4208 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r7b-12-2024", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3206, + "Completeness": 0.3147, + "Conciseness": 0.1387, + "Helpfulness": 0.3103, + "Honesty": 0.3096, + "Harmlessness": 0.3199, + "3C3H Score": 0.2856 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1514, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6552, + "Reasoning": 0.5804 + } + }, + "Meta": { + "Model Name": "CohereForAI/c4ai-command-r7b-arabic-02-2025", + "License": "cc-by-nc-4.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.1765, + "Completeness": 0.0931, + "Conciseness": 0.1333, + "Helpfulness": 0.1201, + "Honesty": 0.1681, + "Harmlessness": 0.175, + "3C3H Score": 0.1444 }, "Tasks Scores": { - "Question Answering (QA)": 0.2919, + "Question Answering (QA)": 0.1533, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7292, - "Reasoning": 0.8423 + "Safety": 0.3083, + "Reasoning": 0.0869 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-72B-Instruct", - "License": "qwen", + "Model Name": "FreedomIntelligence/AceGPT-v1.5-13B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float32", + "Params": 13.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3598, + "Completeness": 0.2961, + "Conciseness": 0.2625, + "Helpfulness": 0.3208, + "Honesty": 0.3532, + "Harmlessness": 0.3591, + "3C3H Score": 0.3252 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.1946, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.9083, + "Reasoning": 0.4905 + } + }, + "Meta": { + "Model Name": "FreedomIntelligence/AceGPT-v2-32B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float16", + "Params": 32.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4343, + "Completeness": 0.3235, + "Conciseness": 0.3216, + "Helpfulness": 0.3755, + "Honesty": 0.424, + "Harmlessness": 0.4336, + "3C3H Score": 0.3854 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.3131, + "Orthographic and Grammatical Analysis": 0.025, + "Safety": 0.8875, + "Reasoning": 0.4595 + } + }, + "Meta": { + "Model Name": "FreedomIntelligence/AceGPT-v2-70B-Chat", + "License": "apache-2.0", + "Revision": "main", + "Precision": "float16", + "Params": 70.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4569, + "Completeness": 0.452, + "Conciseness": 0.1904, + "Helpfulness": 0.4365, + "Honesty": 0.4373, + "Harmlessness": 0.4554, + "3C3H Score": 0.4047 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2712, + "Orthographic and Grammatical Analysis": 0.0278, + "Safety": 0.8031, + "Reasoning": 0.7202 + } + }, + "Meta": { + "Model Name": "MaziyarPanahi/calme-2.1-qwen2.5-72b", + "License": "tongyi-qianwen", "Revision": "main", "Precision": "bfloat16", "Params": 72.0, @@ -32,27 +512,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4892, - "Completeness": 0.4451, - "Conciseness": 0.324, - "Helpfulness": 0.4667, - "Honesty": 0.4738, - "Harmlessness": 0.4885, - "3C3H Score": 0.4479 + "Correctness": 0.4745, + "Completeness": 0.4716, + "Conciseness": 0.2025, + "Helpfulness": 0.4603, + "Honesty": 0.4581, + "Harmlessness": 0.4745, + "3C3H Score": 0.4236 }, "Tasks Scores": { - "Question Answering (QA)": 0.2968, - "Orthographic and Grammatical Analysis": 0.0958, - "Safety": 0.951, - "Reasoning": 0.7429 + "Question Answering (QA)": 0.2809, + "Orthographic and Grammatical Analysis": 0.0542, + "Safety": 0.8011, + "Reasoning": 0.7738 } }, "Meta": { - "Model Name": "claude-3-5-haiku-20241022", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "MaziyarPanahi/calme-2.2-qwen2.5-72b", + "License": "tongyi-qianwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 72.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -62,27 +542,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.6049, - "Completeness": 0.5667, - "Conciseness": 0.3914, - "Helpfulness": 0.586, - "Honesty": 0.585, - "Harmlessness": 0.602, - "3C3H Score": 0.556 + "Correctness": 0.3088, + "Completeness": 0.2461, + "Conciseness": 0.1998, + "Helpfulness": 0.2674, + "Honesty": 0.2956, + "Harmlessness": 0.3081, + "3C3H Score": 0.271 }, "Tasks Scores": { - "Question Answering (QA)": 0.4152, - "Orthographic and Grammatical Analysis": 0.3625, - "Safety": 0.9687, - "Reasoning": 0.8054 + "Question Answering (QA)": 0.1979, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7854, + "Reasoning": 0.3018 } }, "Meta": { - "Model Name": "claude-3-5-sonnet-20241022", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Mohaddz/Thinking-Camel-7b", + "License": "Open", + "Revision": "main", + "Precision": "float16", + "Params": 7.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -92,57 +572,117 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.6225, - "Completeness": 0.5853, - "Conciseness": 0.3449, - "Helpfulness": 0.6039, - "Honesty": 0.614, - "Harmlessness": 0.6218, - "3C3H Score": 0.5654 + "Correctness": 0.3108, + "Completeness": 0.2471, + "Conciseness": 0.2005, + "Helpfulness": 0.2672, + "Honesty": 0.299, + "Harmlessness": 0.31, + "3C3H Score": 0.2724 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2002, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7865, + "Reasoning": 0.3018 + } + }, + "Meta": { + "Model Name": "Mohaddz/Thinking-cow-7B", + "License": "Apache license 2.0", + "Revision": "main", + "Precision": "float16", + "Params": 7.0, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.3275, + "Completeness": 0.2284, + "Conciseness": 0.2463, + "Helpfulness": 0.2613, + "Honesty": 0.3159, + "Harmlessness": 0.3275, + "3C3H Score": 0.2845 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2005, + "Orthographic and Grammatical Analysis": 0.0444, + "Safety": 0.8302, + "Reasoning": 0.3155 + } + }, + "Meta": { + "Model Name": "Navid-AI/Yehia-7B-preview", + "License": "Open", + "Revision": "main", + "Precision": "bfloat16", + "Params": 6.524, + "Total Entries": 340, + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.2301, + "Completeness": 0.2173, + "Conciseness": 0.0376, + "Helpfulness": 0.1323, + "Honesty": 0.2117, + "Harmlessness": 0.2107, + "3C3H Score": 0.1733 }, "Tasks Scores": { - "Question Answering (QA)": 0.4179, - "Orthographic and Grammatical Analysis": 0.4042, - "Safety": 0.8698, - "Reasoning": 0.8821 + "Question Answering (QA)": 0.0706, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.5365, + "Reasoning": 0.3358 } }, "Meta": { - "Model Name": "claude-3-7-sonnet-20250219", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/QwQ-32B-Preview", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.5755, - "Completeness": 0.5392, - "Conciseness": 0.2561, - "Helpfulness": 0.5495, - "Honesty": 0.5642, - "Harmlessness": 0.5755, - "3C3H Score": 0.51 + "Correctness": 0.3088, + "Completeness": 0.3069, + "Conciseness": 0.0137, + "Helpfulness": 0.223, + "Honesty": 0.2953, + "Harmlessness": 0.3074, + "3C3H Score": 0.2425 }, "Tasks Scores": { - "Question Answering (QA)": 0.4041, - "Orthographic and Grammatical Analysis": 0.1833, - "Safety": 0.7, - "Reasoning": 0.8441 + "Question Answering (QA)": 0.149, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.1906, + "Reasoning": 0.6435 } }, "Meta": { - "Model Name": "deepseek-chat", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/QwQ-32B", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -152,57 +692,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.6314, - "Completeness": 0.5667, - "Conciseness": 0.3995, - "Helpfulness": 0.5966, - "Honesty": 0.6179, - "Harmlessness": 0.6306, - "3C3H Score": 0.5738 + "Correctness": 0.0944, + "Completeness": 0.0855, + "Conciseness": 0.0339, + "Helpfulness": 0.0723, + "Honesty": 0.0819, + "Harmlessness": 0.0878, + "3C3H Score": 0.076 }, "Tasks Scores": { - "Question Answering (QA)": 0.4704, - "Orthographic and Grammatical Analysis": 0.2306, - "Safety": 0.9021, - "Reasoning": 0.8286 + "Question Answering (QA)": 0.0469, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.399, + "Reasoning": 0.0065 } }, "Meta": { - "Model Name": "gpt-4o-2024-08-06", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/Qwen2.5-0.5B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 0.465, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.451, - "Completeness": 0.4088, - "Conciseness": 0.276, - "Helpfulness": 0.4206, - "Honesty": 0.4358, - "Harmlessness": 0.4451, - "3C3H Score": 0.4062 + "Correctness": 0.1882, + "Completeness": 0.1882, + "Conciseness": 0.1096, + "Helpfulness": 0.1596, + "Honesty": 0.1846, + "Harmlessness": 0.1846, + "3C3H Score": 0.1691 }, "Tasks Scores": { - "Question Answering (QA)": 0.2562, - "Orthographic and Grammatical Analysis": 0.0361, - "Safety": 0.8677, - "Reasoning": 0.7298 + "Question Answering (QA)": 0.0465, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6979, + "Reasoning": 0.2899 } }, "Meta": { - "Model Name": "gpt-4o-mini-2024-07-18", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/Qwen2.5-1.5B-Instruct", + "License": "qwen", + "Revision": "main", + "Precision": "bfloat16", + "Params": 1.443, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -212,27 +752,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.7588, - "Completeness": 0.7098, - "Conciseness": 0.5125, - "Helpfulness": 0.7255, - "Honesty": 0.7525, - "Harmlessness": 0.7559, - "3C3H Score": 0.7025 + "Correctness": 0.3833, + "Completeness": 0.3647, + "Conciseness": 0.1978, + "Helpfulness": 0.3652, + "Honesty": 0.376, + "Harmlessness": 0.3826, + "3C3H Score": 0.3449 }, "Tasks Scores": { - "Question Answering (QA)": 0.6051, - "Orthographic and Grammatical Analysis": 0.4528, - "Safety": 0.9437, - "Reasoning": 0.95 + "Question Answering (QA)": 0.1585, + "Orthographic and Grammatical Analysis": 0.0306, + "Safety": 0.8281, + "Reasoning": 0.7363 } }, "Meta": { - "Model Name": "o1-2024-12-17", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/Qwen2.5-14B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 14.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -242,27 +782,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4755, - "Completeness": 0.4676, - "Conciseness": 0.2804, - "Helpfulness": 0.4627, - "Honesty": 0.4667, - "Harmlessness": 0.474, - "3C3H Score": 0.4378 + "Correctness": 0.4235, + "Completeness": 0.3922, + "Conciseness": 0.2162, + "Helpfulness": 0.3971, + "Honesty": 0.4132, + "Harmlessness": 0.4223, + "3C3H Score": 0.3774 }, "Tasks Scores": { - "Question Answering (QA)": 0.2435, - "Orthographic and Grammatical Analysis": 0.0292, - "Safety": 0.8958, - "Reasoning": 0.9065 + "Question Answering (QA)": 0.2031, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8188, + "Reasoning": 0.7851 } }, "Meta": { - "Model Name": "o1-mini-2024-09-12", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/Qwen2.5-32B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 32.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -272,27 +812,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.5608, - "Completeness": 0.5235, - "Conciseness": 0.3672, - "Helpfulness": 0.5353, - "Honesty": 0.551, - "Harmlessness": 0.56, - "3C3H Score": 0.5163 + "Correctness": 0.2598, + "Completeness": 0.2598, + "Conciseness": 0.1304, + "Helpfulness": 0.2431, + "Honesty": 0.2559, + "Harmlessness": 0.2561, + "3C3H Score": 0.2342 }, "Tasks Scores": { - "Question Answering (QA)": 0.3458, - "Orthographic and Grammatical Analysis": 0.0875, - "Safety": 0.9448, - "Reasoning": 0.9423 + "Question Answering (QA)": 0.0665, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8646, + "Reasoning": 0.4536 } }, "Meta": { - "Model Name": "o3-mini-2025-01-31", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "Qwen/Qwen2.5-3B-Instruct", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 3.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -302,27 +842,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3088, - "Completeness": 0.2461, - "Conciseness": 0.1998, - "Helpfulness": 0.2674, - "Honesty": 0.2956, - "Harmlessness": 0.3081, - "3C3H Score": 0.271 + "Correctness": 0.4882, + "Completeness": 0.4755, + "Conciseness": 0.1973, + "Helpfulness": 0.4659, + "Honesty": 0.4711, + "Harmlessness": 0.4875, + "3C3H Score": 0.4309 }, "Tasks Scores": { - "Question Answering (QA)": 0.1979, + "Question Answering (QA)": 0.2919, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7854, - "Reasoning": 0.3018 + "Safety": 0.7292, + "Reasoning": 0.8423 } }, "Meta": { - "Model Name": "Mohaddz/Thinking-Camel-7b", - "License": "Open", + "Model Name": "Qwen/Qwen2.5-72B-Instruct", + "License": "qwen", "Revision": "main", - "Precision": "float16", - "Params": 7.0, + "Precision": "bfloat16", + "Params": 72.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -332,27 +872,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3735, - "Completeness": 0.3539, - "Conciseness": 0.1699, - "Helpfulness": 0.3554, - "Honesty": 0.3625, - "Harmlessness": 0.3735, - "3C3H Score": 0.3315 + "Correctness": 0.3275, + "Completeness": 0.3108, + "Conciseness": 0.1395, + "Helpfulness": 0.3081, + "Honesty": 0.3174, + "Harmlessness": 0.326, + "3C3H Score": 0.2882 }, "Tasks Scores": { - "Question Answering (QA)": 0.1528, + "Question Answering (QA)": 0.1199, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7521, - "Reasoning": 0.7435 + "Safety": 0.7729, + "Reasoning": 0.6155 } }, "Meta": { - "Model Name": "1024m/PHI-4-Hindi-4bit", - "License": "Open", + "Model Name": "Qwen/Qwen2.5-7B-Instruct", + "License": "apache-2.0", "Revision": "main", - "Precision": "4bit", - "Params": 14.0, + "Precision": "bfloat16", + "Params": 7.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -362,27 +902,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3147, - "Completeness": 0.2529, - "Conciseness": 0.2027, - "Helpfulness": 0.2713, - "Honesty": 0.2988, - "Harmlessness": 0.3088, - "3C3H Score": 0.2749 + "Correctness": 0.4892, + "Completeness": 0.4451, + "Conciseness": 0.324, + "Helpfulness": 0.4667, + "Honesty": 0.4738, + "Harmlessness": 0.4885, + "3C3H Score": 0.4479 }, "Tasks Scores": { - "Question Answering (QA)": 0.1996, - "Orthographic and Grammatical Analysis": 0.0056, - "Safety": 0.7625, - "Reasoning": 0.3268 + "Question Answering (QA)": 0.2968, + "Orthographic and Grammatical Analysis": 0.0958, + "Safety": 0.951, + "Reasoning": 0.7429 } }, "Meta": { - "Model Name": "ALLaM-AI/ALLaM-7B-Instruct-preview", - "License": "apache-2.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 7.0, + "Model Name": "claude-3-5-haiku-20241022", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -392,27 +932,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2451, - "Completeness": 0.2059, - "Conciseness": 0.1282, - "Helpfulness": 0.2088, - "Honesty": 0.2375, - "Harmlessness": 0.2436, - "3C3H Score": 0.2115 + "Correctness": 0.6049, + "Completeness": 0.5667, + "Conciseness": 0.3914, + "Helpfulness": 0.586, + "Honesty": 0.585, + "Harmlessness": 0.602, + "3C3H Score": 0.556 }, "Tasks Scores": { - "Question Answering (QA)": 0.1927, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.4146, - "Reasoning": 0.2399 + "Question Answering (QA)": 0.4152, + "Orthographic and Grammatical Analysis": 0.3625, + "Safety": 0.9687, + "Reasoning": 0.8054 } }, "Meta": { - "Model Name": "CohereForAI/aya-23-35B", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "float16", - "Params": 35.0, + "Model Name": "claude-3-5-sonnet-20241022", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -422,27 +962,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1765, - "Completeness": 0.1461, - "Conciseness": 0.0929, - "Helpfulness": 0.1502, - "Honesty": 0.1725, - "Harmlessness": 0.1757, - "3C3H Score": 0.1523 + "Correctness": 0.6225, + "Completeness": 0.5853, + "Conciseness": 0.3449, + "Helpfulness": 0.6039, + "Honesty": 0.614, + "Harmlessness": 0.6218, + "3C3H Score": 0.5654 }, "Tasks Scores": { - "Question Answering (QA)": 0.1296, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.4844, - "Reasoning": 0.0929 + "Question Answering (QA)": 0.4179, + "Orthographic and Grammatical Analysis": 0.4042, + "Safety": 0.8698, + "Reasoning": 0.8821 } }, "Meta": { - "Model Name": "CohereForAI/aya-23-8B", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "float16", - "Params": 8.0, + "Model Name": "claude-3-7-sonnet-20250219", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -452,57 +992,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3795, - "Completeness": 0.3618, - "Conciseness": 0.1401, - "Helpfulness": 0.3545, - "Honesty": 0.3582, - "Harmlessness": 0.3744, - "3C3H Score": 0.3281 + "Correctness": 0.4098, + "Completeness": 0.3539, + "Conciseness": 0.2368, + "Helpfulness": 0.3792, + "Honesty": 0.3887, + "Harmlessness": 0.4098, + "3C3H Score": 0.363 }, "Tasks Scores": { - "Question Answering (QA)": 0.2394, - "Orthographic and Grammatical Analysis": 0.0556, - "Safety": 0.6823, - "Reasoning": 0.4946 + "Question Answering (QA)": 0.2707, + "Orthographic and Grammatical Analysis": 0.0514, + "Safety": 0.8927, + "Reasoning": 0.4577 } }, "Meta": { - "Model Name": "CohereForAI/aya-expanse-32b", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "float16", - "Params": 32.0, + "Model Name": "claude-3-haiku-20240307", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3029, - "Completeness": 0.2882, - "Conciseness": 0.1022, - "Helpfulness": 0.2841, - "Honesty": 0.2902, - "Harmlessness": 0.3015, - "3C3H Score": 0.2615 + "Correctness": 0.5755, + "Completeness": 0.5392, + "Conciseness": 0.2561, + "Helpfulness": 0.5495, + "Honesty": 0.5642, + "Harmlessness": 0.5755, + "3C3H Score": 0.51 }, "Tasks Scores": { - "Question Answering (QA)": 0.174, - "Orthographic and Grammatical Analysis": 0.0319, - "Safety": 0.6531, - "Reasoning": 0.3863 + "Question Answering (QA)": 0.4041, + "Orthographic and Grammatical Analysis": 0.1833, + "Safety": 0.7, + "Reasoning": 0.8441 } }, "Meta": { - "Model Name": "CohereForAI/aya-expanse-8b", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "float16", - "Params": 8.0, + "Model Name": "deepseek-chat-v3", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -512,27 +1052,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.5412, - "Completeness": 0.5275, - "Conciseness": 0.2047, - "Helpfulness": 0.5284, - "Honesty": 0.5287, - "Harmlessness": 0.5397, - "3C3H Score": 0.4783 + "Correctness": 0.3931, + "Completeness": 0.3765, + "Conciseness": 0.211, + "Helpfulness": 0.377, + "Honesty": 0.3843, + "Harmlessness": 0.3931, + "3C3H Score": 0.3558 }, "Tasks Scores": { - "Question Answering (QA)": 0.3701, - "Orthographic and Grammatical Analysis": 0.1444, - "Safety": 0.7604, - "Reasoning": 0.7696 + "Question Answering (QA)": 0.2201, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8865, + "Reasoning": 0.5929 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-a-03-2025", - "License": "cc-by-nc-4.0", + "Model Name": "google/gemma-2-27b-it", + "License": "gemma", "Revision": "main", "Precision": "bfloat16", - "Params": 111.0, + "Params": 27.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -542,117 +1082,117 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3235, - "Completeness": 0.2742, - "Conciseness": 0.162, - "Helpfulness": 0.2818, - "Honesty": 0.3119, - "Harmlessness": 0.3235, - "3C3H Score": 0.2795 + "Correctness": 0.3343, + "Completeness": 0.3196, + "Conciseness": 0.1861, + "Helpfulness": 0.323, + "Honesty": 0.3294, + "Harmlessness": 0.3336, + "3C3H Score": 0.3043 }, "Tasks Scores": { - "Question Answering (QA)": 0.2439, - "Orthographic and Grammatical Analysis": 0.0333, - "Safety": 0.4042, - "Reasoning": 0.4143 + "Question Answering (QA)": 0.1633, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8875, + "Reasoning": 0.5072 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r-08-2024", - "License": "cc-by-nc-4.0", + "Model Name": "google/gemma-2-9b-it", + "License": "gemma", "Revision": "main", - "Precision": "float16", - "Params": 32.0, + "Precision": "bfloat16", + "Params": 9.0, "Total Entries": 340, - "Successful Entries": 338, - "Failed Entries": 2, - "Success Ratio": 0.9941 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3529, - "Completeness": 0.3137, - "Conciseness": 0.1652, - "Helpfulness": 0.3069, - "Honesty": 0.3363, - "Harmlessness": 0.3485, - "3C3H Score": 0.3039 + "Correctness": 0.4888, + "Completeness": 0.4792, + "Conciseness": 0.1976, + "Helpfulness": 0.4662, + "Honesty": 0.4702, + "Harmlessness": 0.488, + "3C3H Score": 0.4317 }, "Tasks Scores": { - "Question Answering (QA)": 0.2773, + "Question Answering (QA)": 0.2443, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.3646, - "Reasoning": 0.4756 + "Safety": 0.7927, + "Reasoning": 0.8 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r-plus-08-2024", - "License": "cc-by-nc-4.0", + "Model Name": "google/gemma-3-12b-it", + "License": "gemma", "Revision": "main", - "Precision": "float16", - "Params": 104.0, + "Precision": "bfloat16", + "Params": 12.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 313, + "Failed Entries": 27, + "Success Ratio": 0.9206 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3667, - "Completeness": 0.302, - "Conciseness": 0.1968, - "Helpfulness": 0.3132, - "Honesty": 0.3559, - "Harmlessness": 0.3667, - "3C3H Score": 0.3169 + "Correctness": 0.2101, + "Completeness": 0.2041, + "Conciseness": 0.0466, + "Helpfulness": 0.1834, + "Honesty": 0.1997, + "Harmlessness": 0.2034, + "3C3H Score": 0.1746 }, "Tasks Scores": { - "Question Answering (QA)": 0.2866, - "Orthographic and Grammatical Analysis": 0.0639, - "Safety": 0.6469, - "Reasoning": 0.3232 + "Question Answering (QA)": 0.0694, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7292, + "Reasoning": 0.2298 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r-plus", - "License": "cc-by-nc-4.0", + "Model Name": "google/gemma-3-1b-it", + "License": "gemma", "Revision": "main", - "Precision": "float16", - "Params": 104.0, + "Precision": "bfloat16", + "Params": 1.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 338, + "Failed Entries": 2, + "Success Ratio": 0.9941 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2517, - "Completeness": 0.2104, - "Conciseness": 0.115, - "Helpfulness": 0.2099, - "Honesty": 0.237, - "Harmlessness": 0.2495, - "3C3H Score": 0.2123 + "Correctness": 0.5231, + "Completeness": 0.5064, + "Conciseness": 0.1868, + "Helpfulness": 0.4939, + "Honesty": 0.5044, + "Harmlessness": 0.5172, + "3C3H Score": 0.4553 }, "Tasks Scores": { - "Question Answering (QA)": 0.2255, - "Orthographic and Grammatical Analysis": 0.0333, - "Safety": 0.2937, - "Reasoning": 0.2048 + "Question Answering (QA)": 0.3213, + "Orthographic and Grammatical Analysis": 0.0292, + "Safety": 0.7724, + "Reasoning": 0.8441 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r-v01", - "License": "cc-by-nc-4.0", + "Model Name": "google/gemma-3-27b-it", + "License": "gemma", "Revision": "main", - "Precision": "float16", - "Params": 35.0, + "Precision": "bfloat16", + "Params": 27.0, "Total Entries": 340, "Successful Entries": 339, "Failed Entries": 1, @@ -662,27 +1202,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4569, - "Completeness": 0.452, - "Conciseness": 0.1904, - "Helpfulness": 0.4365, - "Honesty": 0.4373, - "Harmlessness": 0.4554, - "3C3H Score": 0.4047 + "Correctness": 0.3392, + "Completeness": 0.3363, + "Conciseness": 0.1088, + "Helpfulness": 0.3186, + "Honesty": 0.3316, + "Harmlessness": 0.337, + "3C3H Score": 0.2953 }, "Tasks Scores": { - "Question Answering (QA)": 0.2712, - "Orthographic and Grammatical Analysis": 0.0278, - "Safety": 0.8031, - "Reasoning": 0.7202 + "Question Answering (QA)": 0.1067, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8229, + "Reasoning": 0.6589 } }, "Meta": { - "Model Name": "MaziyarPanahi/calme-2.1-qwen2.5-72b", - "License": "tongyi-qianwen", + "Model Name": "google/gemma-3-4b-it", + "License": "gemma", "Revision": "main", "Precision": "bfloat16", - "Params": 72.0, + "Params": 4.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -692,57 +1232,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4745, - "Completeness": 0.4716, - "Conciseness": 0.2025, - "Helpfulness": 0.4603, - "Honesty": 0.4581, - "Harmlessness": 0.4745, - "3C3H Score": 0.4236 + "Correctness": 0.3304, + "Completeness": 0.2832, + "Conciseness": 0.1927, + "Helpfulness": 0.2898, + "Honesty": 0.3142, + "Harmlessness": 0.3267, + "3C3H Score": 0.2895 }, "Tasks Scores": { - "Question Answering (QA)": 0.2809, - "Orthographic and Grammatical Analysis": 0.0542, - "Safety": 0.8011, - "Reasoning": 0.7738 + "Question Answering (QA)": 0.2124, + "Orthographic and Grammatical Analysis": 0.0194, + "Safety": 0.8448, + "Reasoning": 0.3071 } }, "Meta": { - "Model Name": "MaziyarPanahi/calme-2.2-qwen2.5-72b", - "License": "tongyi-qianwen", + "Model Name": "inceptionai/jais-adapted-13b-chat", + "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 72.0, + "Precision": "float32", + "Params": 13.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3108, - "Completeness": 0.2471, - "Conciseness": 0.2005, - "Helpfulness": 0.2672, - "Honesty": 0.299, - "Harmlessness": 0.31, - "3C3H Score": 0.2724 + "Correctness": 0.4206, + "Completeness": 0.3716, + "Conciseness": 0.1875, + "Helpfulness": 0.3752, + "Honesty": 0.3912, + "Harmlessness": 0.4199, + "3C3H Score": 0.361 }, "Tasks Scores": { - "Question Answering (QA)": 0.2002, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7865, - "Reasoning": 0.3018 + "Question Answering (QA)": 0.2878, + "Orthographic and Grammatical Analysis": 0.0306, + "Safety": 0.8188, + "Reasoning": 0.45 } }, "Meta": { - "Model Name": "Mohaddz/Thinking-cow-7B", - "License": "Apache license 2.0", + "Model Name": "inceptionai/jais-adapted-70b-chat", + "License": "apache-2.0", "Revision": "main", - "Precision": "float16", - "Params": 7.0, + "Precision": "float32", + "Params": 70.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -752,27 +1292,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3275, - "Completeness": 0.2284, - "Conciseness": 0.2463, - "Helpfulness": 0.2613, - "Honesty": 0.3159, - "Harmlessness": 0.3275, - "3C3H Score": 0.2845 + "Correctness": 0.2627, + "Completeness": 0.2392, + "Conciseness": 0.1206, + "Helpfulness": 0.2424, + "Honesty": 0.2468, + "Harmlessness": 0.2627, + "3C3H Score": 0.2291 }, "Tasks Scores": { - "Question Answering (QA)": 0.2005, - "Orthographic and Grammatical Analysis": 0.0444, - "Safety": 0.8302, - "Reasoning": 0.3155 + "Question Answering (QA)": 0.1511, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7479, + "Reasoning": 0.2536 } }, "Meta": { - "Model Name": "Navid-AI/Yehia-7B-preview", - "License": "Open", + "Model Name": "inceptionai/jais-family-13b-chat", + "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 6.524, + "Precision": "float32", + "Params": 13.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -782,57 +1322,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2301, - "Completeness": 0.2173, - "Conciseness": 0.0376, - "Helpfulness": 0.1323, - "Honesty": 0.2117, - "Harmlessness": 0.2107, - "3C3H Score": 0.1733 + "Correctness": 0.1667, + "Completeness": 0.1627, + "Conciseness": 0.0603, + "Helpfulness": 0.1392, + "Honesty": 0.1439, + "Harmlessness": 0.1615, + "3C3H Score": 0.1391 }, "Tasks Scores": { - "Question Answering (QA)": 0.0706, + "Question Answering (QA)": 0.0885, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.5365, - "Reasoning": 0.3358 + "Safety": 0.3938, + "Reasoning": 0.1976 } }, "Meta": { - "Model Name": "Qwen/QwQ-32B-Preview", + "Model Name": "inceptionai/jais-family-1p3b-chat", "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 32.0, + "Precision": "float32", + "Params": 1.0, "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3088, - "Completeness": 0.3069, - "Conciseness": 0.0137, - "Helpfulness": 0.223, - "Honesty": 0.2953, - "Harmlessness": 0.3074, - "3C3H Score": 0.2425 + "Correctness": 0.2108, + "Completeness": 0.1971, + "Conciseness": 0.077, + "Helpfulness": 0.1828, + "Honesty": 0.189, + "Harmlessness": 0.2064, + "3C3H Score": 0.1772 }, "Tasks Scores": { - "Question Answering (QA)": 0.149, + "Question Answering (QA)": 0.111, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.1906, - "Reasoning": 0.6435 + "Safety": 0.7052, + "Reasoning": 0.1405 } }, "Meta": { - "Model Name": "Qwen/QwQ-32B", + "Model Name": "inceptionai/jais-family-2p7b-chat", "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 32.0, + "Precision": "float32", + "Params": 3.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -842,27 +1382,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0944, - "Completeness": 0.0855, - "Conciseness": 0.0339, - "Helpfulness": 0.0723, - "Honesty": 0.0819, - "Harmlessness": 0.0878, - "3C3H Score": 0.076 + "Correctness": 0.3048, + "Completeness": 0.2793, + "Conciseness": 0.1362, + "Helpfulness": 0.2778, + "Honesty": 0.282, + "Harmlessness": 0.3041, + "3C3H Score": 0.264 }, "Tasks Scores": { - "Question Answering (QA)": 0.0469, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.399, - "Reasoning": 0.0065 + "Question Answering (QA)": 0.1863, + "Orthographic and Grammatical Analysis": 0.0222, + "Safety": 0.7521, + "Reasoning": 0.3095 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-0.5B-Instruct", + "Model Name": "inceptionai/jais-family-30b-16k-chat", "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 0.465, + "Precision": "float32", + "Params": 30.0, "Total Entries": 340, "Successful Entries": 339, "Failed Entries": 1, @@ -872,27 +1412,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1882, - "Completeness": 0.1882, - "Conciseness": 0.1096, - "Helpfulness": 0.1596, - "Honesty": 0.1846, - "Harmlessness": 0.1846, - "3C3H Score": 0.1691 + "Correctness": 0.2784, + "Completeness": 0.2569, + "Conciseness": 0.1275, + "Helpfulness": 0.2485, + "Honesty": 0.2632, + "Harmlessness": 0.2755, + "3C3H Score": 0.2417 }, "Tasks Scores": { - "Question Answering (QA)": 0.0465, + "Question Answering (QA)": 0.1665, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.6979, - "Reasoning": 0.2899 + "Safety": 0.7177, + "Reasoning": 0.2881 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-1.5B-Instruct", - "License": "qwen", + "Model Name": "inceptionai/jais-family-30b-8k-chat", + "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 1.443, + "Precision": "float32", + "Params": 30.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -902,27 +1442,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3833, - "Completeness": 0.3647, - "Conciseness": 0.1978, - "Helpfulness": 0.3652, - "Honesty": 0.376, - "Harmlessness": 0.3826, - "3C3H Score": 0.3449 + "Correctness": 0.0725, + "Completeness": 0.0637, + "Conciseness": 0.0228, + "Helpfulness": 0.0483, + "Honesty": 0.0556, + "Harmlessness": 0.0713, + "3C3H Score": 0.0557 }, "Tasks Scores": { - "Question Answering (QA)": 0.1585, - "Orthographic and Grammatical Analysis": 0.0306, - "Safety": 0.8281, - "Reasoning": 0.7363 + "Question Answering (QA)": 0.046, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.174, + "Reasoning": 0.0399 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-14B-Instruct", + "Model Name": "inceptionai/jais-family-590m-chat", "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 14.0, + "Precision": "float32", + "Params": 0.719, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -932,27 +1472,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4235, - "Completeness": 0.3922, - "Conciseness": 0.2162, - "Helpfulness": 0.3971, - "Honesty": 0.4132, - "Harmlessness": 0.4223, - "3C3H Score": 0.3774 + "Correctness": 0.2275, + "Completeness": 0.1961, + "Conciseness": 0.0995, + "Helpfulness": 0.2029, + "Honesty": 0.2078, + "Harmlessness": 0.2238, + "3C3H Score": 0.1929 }, "Tasks Scores": { - "Question Answering (QA)": 0.2031, + "Question Answering (QA)": 0.1413, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8188, - "Reasoning": 0.7851 + "Safety": 0.6208, + "Reasoning": 0.1786 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-32B-Instruct", + "Model Name": "inceptionai/jais-family-6p7b-chat", "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 32.0, + "Precision": "float32", + "Params": 7.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -962,27 +1502,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2598, - "Completeness": 0.2598, - "Conciseness": 0.1304, - "Helpfulness": 0.2431, - "Honesty": 0.2559, - "Harmlessness": 0.2561, - "3C3H Score": 0.2342 + "Correctness": 0.0029, + "Completeness": 0.0029, + "Conciseness": 0.0, + "Helpfulness": 0.0007, + "Honesty": 0.0029, + "Harmlessness": 0.0029, + "3C3H Score": 0.0021 }, "Tasks Scores": { - "Question Answering (QA)": 0.0665, + "Question Answering (QA)": 0.0035, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8646, - "Reasoning": 0.4536 + "Safety": 0.0, + "Reasoning": 0.0 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-3B-Instruct", - "License": "apache-2.0", + "Model Name": "kyutai/helium-1-preview-2b", + "License": "cc-by-4.0", "Revision": "main", "Precision": "bfloat16", - "Params": 3.0, + "Params": 2.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -992,57 +1532,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3304, - "Completeness": 0.2832, - "Conciseness": 0.1927, - "Helpfulness": 0.2898, - "Honesty": 0.3142, - "Harmlessness": 0.3267, - "3C3H Score": 0.2895 - }, - "Tasks Scores": { - "Question Answering (QA)": 0.2124, - "Orthographic and Grammatical Analysis": 0.0194, - "Safety": 0.8448, - "Reasoning": 0.3071 - } - }, - "Meta": { - "Model Name": "inceptionai/jais-adapted-13b-chat", - "License": "apache-2.0", - "Revision": "main", - "Precision": "float32", - "Params": 13.0, - "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 - } - }, - { - "claude-3.5-sonnet Scores": { - "3C3H Scores": { - "Correctness": 0.4206, - "Completeness": 0.3716, - "Conciseness": 0.1875, - "Helpfulness": 0.3752, - "Honesty": 0.3912, - "Harmlessness": 0.4199, - "3C3H Score": 0.361 + "Correctness": 0.4029, + "Completeness": 0.3804, + "Conciseness": 0.1877, + "Helpfulness": 0.3748, + "Honesty": 0.3882, + "Harmlessness": 0.3983, + "3C3H Score": 0.3554 }, "Tasks Scores": { - "Question Answering (QA)": 0.2878, - "Orthographic and Grammatical Analysis": 0.0306, - "Safety": 0.8188, - "Reasoning": 0.45 + "Question Answering (QA)": 0.1775, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7729, + "Reasoning": 0.7774 } }, "Meta": { - "Model Name": "inceptionai/jais-adapted-70b-chat", - "License": "apache-2.0", + "Model Name": "maldv/Qwentile2.5-32B-Instruct", + "License": "Open", "Revision": "main", - "Precision": "float32", - "Params": 70.0, + "Precision": "float16", + "Params": 32.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1052,57 +1562,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2627, - "Completeness": 0.2392, - "Conciseness": 0.1206, - "Helpfulness": 0.2424, - "Honesty": 0.2468, - "Harmlessness": 0.2627, - "3C3H Score": 0.2291 + "Correctness": 0.3931, + "Completeness": 0.3441, + "Conciseness": 0.2596, + "Helpfulness": 0.361, + "Honesty": 0.3784, + "Harmlessness": 0.3895, + "3C3H Score": 0.3543 }, "Tasks Scores": { - "Question Answering (QA)": 0.1511, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7479, - "Reasoning": 0.2536 + "Question Answering (QA)": 0.2044, + "Orthographic and Grammatical Analysis": 0.0333, + "Safety": 0.8719, + "Reasoning": 0.6244 } }, "Meta": { - "Model Name": "inceptionai/jais-family-13b-chat", - "License": "apache-2.0", + "Model Name": "malhajar/Shahin-v0.1", + "License": "Open", "Revision": "main", - "Precision": "float32", - "Params": 13.0, + "Precision": "float16", + "Params": 27.519, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, "Success Ratio": 1.0 - } - }, - { - "claude-3.5-sonnet Scores": { - "3C3H Scores": { - "Correctness": 0.2108, - "Completeness": 0.1971, - "Conciseness": 0.077, - "Helpfulness": 0.1828, - "Honesty": 0.189, - "Harmlessness": 0.2064, - "3C3H Score": 0.1772 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4225, + "Completeness": 0.3569, + "Conciseness": 0.3252, + "Helpfulness": 0.3777, + "Honesty": 0.4147, + "Harmlessness": 0.4218, + "3C3H Score": 0.3865 }, "Tasks Scores": { - "Question Answering (QA)": 0.111, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7052, - "Reasoning": 0.1405 + "Question Answering (QA)": 0.2353, + "Orthographic and Grammatical Analysis": 0.025, + "Safety": 0.8542, + "Reasoning": 0.706 } }, "Meta": { - "Model Name": "inceptionai/jais-family-2p7b-chat", - "License": "apache-2.0", + "Model Name": "meta-llama/Llama-3.1-70B-Instruct", + "License": "llama3.1", "Revision": "main", - "Precision": "float32", - "Params": 3.0, + "Precision": "bfloat16", + "Params": 70.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1112,57 +1622,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3048, - "Completeness": 0.2793, - "Conciseness": 0.1362, - "Helpfulness": 0.2778, - "Honesty": 0.282, - "Harmlessness": 0.3041, - "3C3H Score": 0.264 + "Correctness": 0.2971, + "Completeness": 0.2686, + "Conciseness": 0.1968, + "Helpfulness": 0.261, + "Honesty": 0.2814, + "Harmlessness": 0.2971, + "3C3H Score": 0.267 }, "Tasks Scores": { - "Question Answering (QA)": 0.1863, - "Orthographic and Grammatical Analysis": 0.0222, - "Safety": 0.7521, - "Reasoning": 0.3095 + "Question Answering (QA)": 0.1176, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.8792, + "Reasoning": 0.4583 } }, "Meta": { - "Model Name": "inceptionai/jais-family-30b-16k-chat", - "License": "apache-2.0", + "Model Name": "meta-llama/Llama-3.1-8B-Instruct", + "License": "llama3.1", "Revision": "main", - "Precision": "float32", - "Params": 30.0, + "Precision": "bfloat16", + "Params": 8.0, "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2784, - "Completeness": 0.2569, - "Conciseness": 0.1275, - "Helpfulness": 0.2485, - "Honesty": 0.2632, - "Harmlessness": 0.2755, - "3C3H Score": 0.2417 + "Correctness": 0.1353, + "Completeness": 0.1176, + "Conciseness": 0.0875, + "Helpfulness": 0.1007, + "Honesty": 0.1213, + "Harmlessness": 0.1301, + "3C3H Score": 0.1154 }, "Tasks Scores": { - "Question Answering (QA)": 0.1665, + "Question Answering (QA)": 0.0479, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7177, - "Reasoning": 0.2881 + "Safety": 0.5875, + "Reasoning": 0.0881 } }, "Meta": { - "Model Name": "inceptionai/jais-family-30b-8k-chat", - "License": "apache-2.0", + "Model Name": "meta-llama/Llama-3.2-1B-Instruct", + "License": "llama3.2", "Revision": "main", - "Precision": "float32", - "Params": 30.0, + "Precision": "bfloat16", + "Params": 1.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1172,57 +1682,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0725, - "Completeness": 0.0637, - "Conciseness": 0.0228, - "Helpfulness": 0.0483, - "Honesty": 0.0556, - "Harmlessness": 0.0713, - "3C3H Score": 0.0557 + "Correctness": 0.2468, + "Completeness": 0.2271, + "Conciseness": 0.1657, + "Helpfulness": 0.204, + "Honesty": 0.2335, + "Harmlessness": 0.2424, + "3C3H Score": 0.2199 }, "Tasks Scores": { - "Question Answering (QA)": 0.046, + "Question Answering (QA)": 0.0782, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.174, - "Reasoning": 0.0399 + "Safety": 0.9021, + "Reasoning": 0.3274 } }, "Meta": { - "Model Name": "inceptionai/jais-family-590m-chat", - "License": "apache-2.0", + "Model Name": "meta-llama/Llama-3.2-3B-Instruct", + "License": "llama3.2", "Revision": "main", - "Precision": "float32", - "Params": 0.719, + "Precision": "bfloat16", + "Params": 3.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2275, - "Completeness": 0.1961, - "Conciseness": 0.0995, - "Helpfulness": 0.2029, - "Honesty": 0.2078, - "Harmlessness": 0.2238, - "3C3H Score": 0.1929 + "Correctness": 0.448, + "Completeness": 0.3725, + "Conciseness": 0.3586, + "Helpfulness": 0.3939, + "Honesty": 0.4402, + "Harmlessness": 0.4478, + "3C3H Score": 0.4102 }, "Tasks Scores": { - "Question Answering (QA)": 0.1413, + "Question Answering (QA)": 0.2719, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.6208, - "Reasoning": 0.1786 + "Safety": 0.8792, + "Reasoning": 0.7131 } }, "Meta": { - "Model Name": "inceptionai/jais-family-6p7b-chat", - "License": "apache-2.0", + "Model Name": "meta-llama/Llama-3.3-70B-Instruct", + "License": "llama3.3", "Revision": "main", - "Precision": "float32", - "Params": 7.0, + "Precision": "bfloat16", + "Params": 70.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1232,27 +1742,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0029, - "Completeness": 0.0029, - "Conciseness": 0.0, - "Helpfulness": 0.0007, - "Honesty": 0.0029, - "Harmlessness": 0.0029, - "3C3H Score": 0.0021 + "Correctness": 0.0686, + "Completeness": 0.0657, + "Conciseness": 0.036, + "Helpfulness": 0.0615, + "Honesty": 0.0662, + "Harmlessness": 0.0684, + "3C3H Score": 0.0611 }, "Tasks Scores": { - "Question Answering (QA)": 0.0035, + "Question Answering (QA)": 0.044, "Orthographic and Grammatical Analysis": 0.0, "Safety": 0.0, - "Reasoning": 0.0 + "Reasoning": 0.1708 } }, "Meta": { - "Model Name": "kyutai/helium-1-preview-2b", - "License": "cc-by-4.0", + "Model Name": "meta-llama/Meta-Llama-3-70B-Instruct", + "License": "llama3", "Revision": "main", "Precision": "bfloat16", - "Params": 2.0, + "Params": 70.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1262,27 +1772,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4029, - "Completeness": 0.3804, - "Conciseness": 0.1877, - "Helpfulness": 0.3748, - "Honesty": 0.3882, - "Harmlessness": 0.3983, - "3C3H Score": 0.3554 + "Correctness": 0.0294, + "Completeness": 0.0294, + "Conciseness": 0.0127, + "Helpfulness": 0.026, + "Honesty": 0.0272, + "Harmlessness": 0.0294, + "3C3H Score": 0.0257 }, "Tasks Scores": { - "Question Answering (QA)": 0.1775, + "Question Answering (QA)": 0.0299, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7729, - "Reasoning": 0.7774 + "Safety": 0.0, + "Reasoning": 0.0393 } }, "Meta": { - "Model Name": "maldv/Qwentile2.5-32B-Instruct", - "License": "Open", + "Model Name": "meta-llama/Meta-Llama-3-8B-Instruct", + "License": "llama3", "Revision": "main", - "Precision": "float16", - "Params": 32.0, + "Precision": "bfloat16", + "Params": 14.963, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1292,27 +1802,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3598, - "Completeness": 0.3029, - "Conciseness": 0.2534, - "Helpfulness": 0.3287, - "Honesty": 0.3495, - "Harmlessness": 0.3588, - "3C3H Score": 0.3255 + "Correctness": 0.2667, + "Completeness": 0.2549, + "Conciseness": 0.1257, + "Helpfulness": 0.2368, + "Honesty": 0.2507, + "Harmlessness": 0.2659, + "3C3H Score": 0.2335 }, "Tasks Scores": { - "Question Answering (QA)": 0.2192, + "Question Answering (QA)": 0.1294, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8729, - "Reasoning": 0.456 + "Safety": 0.5042, + "Reasoning": 0.4762 } }, "Meta": { - "Model Name": "gpt-3.5-turbo-0125", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "mistralai/Ministral-8B-Instruct-2410", + "License": "mrl", + "Revision": "main", + "Precision": "bfloat16", + "Params": 8.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1322,117 +1832,117 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4876, - "Completeness": 0.4748, - "Conciseness": 0.202, - "Helpfulness": 0.4696, - "Honesty": 0.4716, - "Harmlessness": 0.4874, - "3C3H Score": 0.4322 + "Correctness": 0.0039, + "Completeness": 0.0039, + "Conciseness": 0.0007, + "Helpfulness": 0.0022, + "Honesty": 0.0032, + "Harmlessness": 0.0039, + "3C3H Score": 0.003 }, "Tasks Scores": { - "Question Answering (QA)": 0.2962, + "Question Answering (QA)": 0.0051, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.75, - "Reasoning": 0.8185 + "Safety": 0.0, + "Reasoning": 0.0 } }, "Meta": { - "Model Name": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", - "License": "qwen", + "Model Name": "mistralai/Mistral-7B-Instruct-v0.2", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", - "Params": 72.0, + "Params": 7.0, "Total Entries": 340, - "Successful Entries": 337, - "Failed Entries": 3, - "Success Ratio": 0.9912 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2029, - "Completeness": 0.1882, - "Conciseness": 0.1096, - "Helpfulness": 0.1772, - "Honesty": 0.1941, - "Harmlessness": 0.2007, - "3C3H Score": 0.1788 + "Correctness": 0.1003, + "Completeness": 0.0826, + "Conciseness": 0.0258, + "Helpfulness": 0.0597, + "Honesty": 0.0774, + "Harmlessness": 0.0966, + "3C3H Score": 0.0737 }, "Tasks Scores": { - "Question Answering (QA)": 0.0802, + "Question Answering (QA)": 0.0431, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7886, - "Reasoning": 0.1887 + "Safety": 0.1646, + "Reasoning": 0.1405 } }, "Meta": { - "Model Name": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", - "License": "Gemma", + "Model Name": "mistralai/Mistral-7B-Instruct-v0.3", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", - "Params": 2.453, + "Params": 7.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1082, - "Completeness": 0.0442, - "Conciseness": 0.0039, - "Helpfulness": 0.0263, - "Honesty": 0.0624, - "Harmlessness": 0.101, - "3C3H Score": 0.0577 + "Correctness": 0.501, + "Completeness": 0.4794, + "Conciseness": 0.2424, + "Helpfulness": 0.4797, + "Honesty": 0.4875, + "Harmlessness": 0.501, + "3C3H Score": 0.4485 }, "Tasks Scores": { - "Question Answering (QA)": 0.0882, - "Orthographic and Grammatical Analysis": 0.0125, - "Safety": 0.0, - "Reasoning": 0.022 + "Question Answering (QA)": 0.3437, + "Orthographic and Grammatical Analysis": 0.0514, + "Safety": 0.7979, + "Reasoning": 0.7185 } }, "Meta": { - "Model Name": "stabilityai/ar-stablelm-2-chat", - "License": "other", + "Model Name": "mistralai/Mistral-Large-Instruct-2411", + "License": "mrl", "Revision": "main", - "Precision": "float32", - "Params": 2.0, + "Precision": "bfloat16", + "Params": 123.0, "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3431, - "Completeness": 0.2892, - "Conciseness": 0.1588, - "Helpfulness": 0.288, - "Honesty": 0.3208, - "Harmlessness": 0.3431, - "3C3H Score": 0.2905 + "Correctness": 0.3598, + "Completeness": 0.3029, + "Conciseness": 0.2534, + "Helpfulness": 0.3287, + "Honesty": 0.3495, + "Harmlessness": 0.3588, + "3C3H Score": 0.3255 }, "Tasks Scores": { - "Question Answering (QA)": 0.2097, + "Question Answering (QA)": 0.2192, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8677, - "Reasoning": 0.3161 + "Safety": 0.8729, + "Reasoning": 0.456 } }, "Meta": { - "Model Name": "utter-project/EuroLLM-9B-Instruct", - "License": "apache-2.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 9.0, + "Model Name": "gpt-3.5-turbo-0125", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1442,27 +1952,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2363, - "Completeness": 0.2255, - "Conciseness": 0.1157, - "Helpfulness": 0.2238, - "Honesty": 0.2299, - "Harmlessness": 0.2363, - "3C3H Score": 0.2112 + "Correctness": 0.6314, + "Completeness": 0.5667, + "Conciseness": 0.3995, + "Helpfulness": 0.5966, + "Honesty": 0.6179, + "Harmlessness": 0.6306, + "3C3H Score": 0.5738 }, "Tasks Scores": { - "Question Answering (QA)": 0.1266, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.4261, - "Reasoning": 0.4208 + "Question Answering (QA)": 0.4704, + "Orthographic and Grammatical Analysis": 0.2306, + "Safety": 0.9021, + "Reasoning": 0.8286 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r7b-12-2024", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 8.0, + "Model Name": "gpt-4o-2024-08-06", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1472,27 +1982,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3206, - "Completeness": 0.3147, - "Conciseness": 0.1387, - "Helpfulness": 0.3103, - "Honesty": 0.3096, - "Harmlessness": 0.3199, - "3C3H Score": 0.2856 + "Correctness": 0.451, + "Completeness": 0.4088, + "Conciseness": 0.276, + "Helpfulness": 0.4206, + "Honesty": 0.4358, + "Harmlessness": 0.4451, + "3C3H Score": 0.4062 }, "Tasks Scores": { - "Question Answering (QA)": 0.1514, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.6552, - "Reasoning": 0.5804 + "Question Answering (QA)": 0.2562, + "Orthographic and Grammatical Analysis": 0.0361, + "Safety": 0.8677, + "Reasoning": 0.7298 } }, "Meta": { - "Model Name": "CohereForAI/c4ai-command-r7b-arabic-02-2025", - "License": "cc-by-nc-4.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 8.0, + "Model Name": "gpt-4o-mini-2024-07-18", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1502,27 +2012,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1765, - "Completeness": 0.0931, - "Conciseness": 0.1333, - "Helpfulness": 0.1201, - "Honesty": 0.1681, - "Harmlessness": 0.175, - "3C3H Score": 0.1444 + "Correctness": 0.7588, + "Completeness": 0.7098, + "Conciseness": 0.5125, + "Helpfulness": 0.7255, + "Honesty": 0.7525, + "Harmlessness": 0.7559, + "3C3H Score": 0.7025 }, "Tasks Scores": { - "Question Answering (QA)": 0.1533, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.3083, - "Reasoning": 0.0869 + "Question Answering (QA)": 0.6051, + "Orthographic and Grammatical Analysis": 0.4528, + "Safety": 0.9437, + "Reasoning": 0.95 } }, "Meta": { - "Model Name": "FreedomIntelligence/AceGPT-v1.5-13B-Chat", - "License": "apache-2.0", - "Revision": "main", - "Precision": "float32", - "Params": 13.0, + "Model Name": "o1-2024-12-17", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1532,27 +2042,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3598, - "Completeness": 0.2961, - "Conciseness": 0.2625, - "Helpfulness": 0.3208, - "Honesty": 0.3532, - "Harmlessness": 0.3591, - "3C3H Score": 0.3252 + "Correctness": 0.4755, + "Completeness": 0.4676, + "Conciseness": 0.2804, + "Helpfulness": 0.4627, + "Honesty": 0.4667, + "Harmlessness": 0.474, + "3C3H Score": 0.4378 }, "Tasks Scores": { - "Question Answering (QA)": 0.1946, - "Orthographic and Grammatical Analysis": 0.0333, - "Safety": 0.9083, - "Reasoning": 0.4905 + "Question Answering (QA)": 0.2435, + "Orthographic and Grammatical Analysis": 0.0292, + "Safety": 0.8958, + "Reasoning": 0.9065 } }, "Meta": { - "Model Name": "FreedomIntelligence/AceGPT-v2-32B-Chat", - "License": "apache-2.0", - "Revision": "main", - "Precision": "float16", - "Params": 32.0, + "Model Name": "o1-mini-2024-09-12", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1562,27 +2072,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4343, - "Completeness": 0.3235, - "Conciseness": 0.3216, - "Helpfulness": 0.3755, - "Honesty": 0.424, - "Harmlessness": 0.4336, - "3C3H Score": 0.3854 + "Correctness": 0.5716, + "Completeness": 0.5392, + "Conciseness": 0.3689, + "Helpfulness": 0.5446, + "Honesty": 0.5608, + "Harmlessness": 0.5708, + "3C3H Score": 0.526 }, "Tasks Scores": { - "Question Answering (QA)": 0.3131, - "Orthographic and Grammatical Analysis": 0.025, - "Safety": 0.8875, - "Reasoning": 0.4595 + "Question Answering (QA)": 0.3575, + "Orthographic and Grammatical Analysis": 0.0889, + "Safety": 0.9469, + "Reasoning": 0.9542 } }, "Meta": { - "Model Name": "FreedomIntelligence/AceGPT-v2-70B-Chat", - "License": "apache-2.0", - "Revision": "main", - "Precision": "float16", - "Params": 70.0, + "Model Name": "o3-mini-2025-01-31", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1592,57 +2102,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3275, - "Completeness": 0.3108, - "Conciseness": 0.1395, - "Helpfulness": 0.3081, - "Honesty": 0.3174, - "Harmlessness": 0.326, - "3C3H Score": 0.2882 + "Correctness": 0.4876, + "Completeness": 0.4748, + "Conciseness": 0.202, + "Helpfulness": 0.4696, + "Honesty": 0.4716, + "Harmlessness": 0.4874, + "3C3H Score": 0.4322 }, "Tasks Scores": { - "Question Answering (QA)": 0.1199, + "Question Answering (QA)": 0.2962, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7729, - "Reasoning": 0.6155 + "Safety": 0.75, + "Reasoning": 0.8185 } }, "Meta": { - "Model Name": "Qwen/Qwen2.5-7B-Instruct", - "License": "apache-2.0", + "Model Name": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", + "License": "qwen", "Revision": "main", "Precision": "bfloat16", - "Params": 7.0, + "Params": 72.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 337, + "Failed Entries": 3, + "Success Ratio": 0.9912 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4098, - "Completeness": 0.3539, - "Conciseness": 0.2368, - "Helpfulness": 0.3792, - "Honesty": 0.3887, - "Harmlessness": 0.4098, - "3C3H Score": 0.363 + "Correctness": 0.2029, + "Completeness": 0.1882, + "Conciseness": 0.1096, + "Helpfulness": 0.1772, + "Honesty": 0.1941, + "Harmlessness": 0.2007, + "3C3H Score": 0.1788 }, "Tasks Scores": { - "Question Answering (QA)": 0.2707, - "Orthographic and Grammatical Analysis": 0.0514, - "Safety": 0.8927, - "Reasoning": 0.4577 + "Question Answering (QA)": 0.0802, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.7886, + "Reasoning": 0.1887 } }, "Meta": { - "Model Name": "claude-3-haiku-20240307", - "License": "Proprietary", - "Revision": "UNK", - "Precision": "UNK", - "Params": "UNK", + "Model Name": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", + "License": "Gemma", + "Revision": "main", + "Precision": "bfloat16", + "Params": 2.453, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1652,174 +2162,174 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3931, - "Completeness": 0.3765, - "Conciseness": 0.211, - "Helpfulness": 0.377, - "Honesty": 0.3843, - "Harmlessness": 0.3931, - "3C3H Score": 0.3558 + "Correctness": 0.1082, + "Completeness": 0.0442, + "Conciseness": 0.0039, + "Helpfulness": 0.0263, + "Honesty": 0.0624, + "Harmlessness": 0.101, + "3C3H Score": 0.0577 }, "Tasks Scores": { - "Question Answering (QA)": 0.2201, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8865, - "Reasoning": 0.5929 + "Question Answering (QA)": 0.0882, + "Orthographic and Grammatical Analysis": 0.0125, + "Safety": 0.0, + "Reasoning": 0.022 } }, "Meta": { - "Model Name": "google/gemma-2-27b-it", - "License": "gemma", + "Model Name": "stabilityai/ar-stablelm-2-chat", + "License": "other", "Revision": "main", - "Precision": "bfloat16", - "Params": 27.0, + "Precision": "float32", + "Params": 2.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3343, - "Completeness": 0.3196, - "Conciseness": 0.1861, - "Helpfulness": 0.323, - "Honesty": 0.3294, - "Harmlessness": 0.3336, - "3C3H Score": 0.3043 + "Correctness": 0.3431, + "Completeness": 0.2892, + "Conciseness": 0.1588, + "Helpfulness": 0.288, + "Honesty": 0.3208, + "Harmlessness": 0.3431, + "3C3H Score": 0.2905 }, "Tasks Scores": { - "Question Answering (QA)": 0.1633, + "Question Answering (QA)": 0.2097, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8875, - "Reasoning": 0.5072 + "Safety": 0.8677, + "Reasoning": 0.3161 } }, "Meta": { - "Model Name": "google/gemma-2-9b-it", - "License": "gemma", + "Model Name": "utter-project/EuroLLM-9B-Instruct", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", "Params": 9.0, "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4888, - "Completeness": 0.4792, - "Conciseness": 0.1976, - "Helpfulness": 0.4662, - "Honesty": 0.4702, - "Harmlessness": 0.488, - "3C3H Score": 0.4317 + "Correctness": 0.1961, + "Completeness": 0.1529, + "Conciseness": 0.1456, + "Helpfulness": 0.1578, + "Honesty": 0.1887, + "Harmlessness": 0.1946, + "3C3H Score": 0.1726 }, "Tasks Scores": { - "Question Answering (QA)": 0.2443, + "Question Answering (QA)": 0.1039, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7927, - "Reasoning": 0.8 + "Safety": 0.7333, + "Reasoning": 0.1226 } }, "Meta": { - "Model Name": "google/gemma-3-12b-it", - "License": "gemma", + "Model Name": "inceptionai/jais-adapted-7b-chat", + "License": "apache-2.0", "Revision": "main", - "Precision": "bfloat16", - "Params": 12.0, + "Precision": "float32", + "Params": 7.0, "Total Entries": 340, - "Successful Entries": 313, - "Failed Entries": 27, - "Success Ratio": 0.9206 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2101, - "Completeness": 0.2041, - "Conciseness": 0.0466, - "Helpfulness": 0.1834, - "Honesty": 0.1997, - "Harmlessness": 0.2034, - "3C3H Score": 0.1746 + "Correctness": 0.1618, + "Completeness": 0.1588, + "Conciseness": 0.0265, + "Helpfulness": 0.1287, + "Honesty": 0.1544, + "Harmlessness": 0.1618, + "3C3H Score": 0.132 }, "Tasks Scores": { - "Question Answering (QA)": 0.0694, + "Question Answering (QA)": 0.0381, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.7292, - "Reasoning": 0.2298 + "Safety": 0.1052, + "Reasoning": 0.472 } }, "Meta": { - "Model Name": "google/gemma-3-1b-it", - "License": "gemma", + "Model Name": "Qwen/Qwen3-0.6B", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", - "Params": 1.0, + "Params": 0.707, "Total Entries": 340, - "Successful Entries": 338, - "Failed Entries": 2, - "Success Ratio": 0.9941 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.5231, - "Completeness": 0.5064, - "Conciseness": 0.1868, - "Helpfulness": 0.4939, - "Honesty": 0.5044, - "Harmlessness": 0.5172, - "3C3H Score": 0.4553 + "Correctness": 0.2284, + "Completeness": 0.2225, + "Conciseness": 0.0199, + "Helpfulness": 0.173, + "Honesty": 0.2223, + "Harmlessness": 0.227, + "3C3H Score": 0.1822 }, "Tasks Scores": { - "Question Answering (QA)": 0.3213, - "Orthographic and Grammatical Analysis": 0.0292, - "Safety": 0.7724, - "Reasoning": 0.8441 + "Question Answering (QA)": 0.0764, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.2062, + "Reasoning": 0.5488 } }, "Meta": { - "Model Name": "google/gemma-3-27b-it", - "License": "gemma", + "Model Name": "Qwen/Qwen3-1.7B", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", - "Params": 27.0, + "Params": 1.9, "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3392, - "Completeness": 0.3363, - "Conciseness": 0.1088, - "Helpfulness": 0.3186, - "Honesty": 0.3316, - "Harmlessness": 0.337, - "3C3H Score": 0.2953 + "Correctness": 0.3333, + "Completeness": 0.3245, + "Conciseness": 0.0338, + "Helpfulness": 0.2676, + "Honesty": 0.3235, + "Harmlessness": 0.3267, + "3C3H Score": 0.2683 }, "Tasks Scores": { - "Question Answering (QA)": 0.1067, + "Question Answering (QA)": 0.1233, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8229, - "Reasoning": 0.6589 + "Safety": 0.6156, + "Reasoning": 0.5988 } }, "Meta": { - "Model Name": "google/gemma-3-4b-it", - "License": "gemma", + "Model Name": "Qwen/Qwen3-4B", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", "Params": 4.0, @@ -1832,87 +2342,117 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1667, - "Completeness": 0.1627, - "Conciseness": 0.0603, - "Helpfulness": 0.1392, - "Honesty": 0.1439, - "Harmlessness": 0.1615, - "3C3H Score": 0.1391 + "Correctness": 0.3561, + "Completeness": 0.3501, + "Conciseness": 0.0277, + "Helpfulness": 0.2933, + "Honesty": 0.3427, + "Harmlessness": 0.3539, + "3C3H Score": 0.2873 }, "Tasks Scores": { - "Question Answering (QA)": 0.0885, + "Question Answering (QA)": 0.1613, "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.3938, - "Reasoning": 0.1976 + "Safety": 0.5969, + "Reasoning": 0.5881 } }, "Meta": { - "Model Name": "inceptionai/jais-family-1p3b-chat", + "Model Name": "Qwen/Qwen3-8B", "License": "apache-2.0", "Revision": "main", - "Precision": "float32", - "Params": 1.0, + "Precision": "bfloat16", + "Params": 8.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 337, + "Failed Entries": 3, + "Success Ratio": 0.9912 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.3931, - "Completeness": 0.3441, - "Conciseness": 0.2596, - "Helpfulness": 0.361, - "Honesty": 0.3784, - "Harmlessness": 0.3895, - "3C3H Score": 0.3543 + "Correctness": 0.4277, + "Completeness": 0.413, + "Conciseness": 0.0514, + "Helpfulness": 0.3668, + "Honesty": 0.4135, + "Harmlessness": 0.4277, + "3C3H Score": 0.35 + }, + "Tasks Scores": { + "Question Answering (QA)": 0.2206, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6906, + "Reasoning": 0.6732 + } + }, + "Meta": { + "Model Name": "Qwen/Qwen3-14B", + "License": "apache-2.0", + "Revision": "main", + "Precision": "bfloat16", + "Params": 14.0, + "Total Entries": 340, + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 + } + }, + { + "claude-3.5-sonnet Scores": { + "3C3H Scores": { + "Correctness": 0.4228, + "Completeness": 0.414, + "Conciseness": 0.0696, + "Helpfulness": 0.3727, + "Honesty": 0.409, + "Harmlessness": 0.4191, + "3C3H Score": 0.3512 }, "Tasks Scores": { - "Question Answering (QA)": 0.2044, - "Orthographic and Grammatical Analysis": 0.0333, - "Safety": 0.8719, - "Reasoning": 0.6244 + "Question Answering (QA)": 0.2262, + "Orthographic and Grammatical Analysis": 0.0292, + "Safety": 0.6885, + "Reasoning": 0.6518 } }, "Meta": { - "Model Name": "malhajar/Shahin-v0.1", - "License": "Open", + "Model Name": "Qwen/Qwen3-32B", + "License": "apache-2.0", "Revision": "main", - "Precision": "float16", - "Params": 27.519, + "Precision": "bfloat16", + "Params": 32.0, "Total Entries": 340, - "Successful Entries": 340, - "Failed Entries": 0, - "Success Ratio": 1.0 + "Successful Entries": 339, + "Failed Entries": 1, + "Success Ratio": 0.9971 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.4225, - "Completeness": 0.3569, - "Conciseness": 0.3252, - "Helpfulness": 0.3777, - "Honesty": 0.4147, - "Harmlessness": 0.4218, - "3C3H Score": 0.3865 + "Correctness": 0.4147, + "Completeness": 0.4118, + "Conciseness": 0.0549, + "Helpfulness": 0.3542, + "Honesty": 0.4051, + "Harmlessness": 0.4132, + "3C3H Score": 0.3423 }, "Tasks Scores": { - "Question Answering (QA)": 0.2353, - "Orthographic and Grammatical Analysis": 0.025, - "Safety": 0.8542, - "Reasoning": 0.706 + "Question Answering (QA)": 0.2217, + "Orthographic and Grammatical Analysis": 0.0, + "Safety": 0.6687, + "Reasoning": 0.647 } }, "Meta": { - "Model Name": "meta-llama/Llama-3.1-70B-Instruct", - "License": "llama3.1", + "Model Name": "Qwen/Qwen3-30B-A3B", + "License": "apache-2.0", "Revision": "main", "Precision": "bfloat16", - "Params": 70.0, + "Params": 30.0, "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1922,27 +2462,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2971, - "Completeness": 0.2686, - "Conciseness": 0.1968, - "Helpfulness": 0.261, - "Honesty": 0.2814, - "Harmlessness": 0.2971, - "3C3H Score": 0.267 + "Correctness": 0.6284, + "Completeness": 0.6049, + "Conciseness": 0.2762, + "Helpfulness": 0.6103, + "Honesty": 0.61, + "Harmlessness": 0.6277, + "3C3H Score": 0.5596 }, "Tasks Scores": { - "Question Answering (QA)": 0.1176, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8792, - "Reasoning": 0.4583 + "Question Answering (QA)": 0.4199, + "Orthographic and Grammatical Analysis": 0.3458, + "Safety": 0.8688, + "Reasoning": 0.8738 } }, "Meta": { - "Model Name": "meta-llama/Llama-3.1-8B-Instruct", - "License": "llama3.1", - "Revision": "main", - "Precision": "bfloat16", - "Params": 8.0, + "Model Name": "claude-sonnet-4-20250514", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1952,27 +2492,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1353, - "Completeness": 0.1176, - "Conciseness": 0.0875, - "Helpfulness": 0.1007, - "Honesty": 0.1213, - "Harmlessness": 0.1301, - "3C3H Score": 0.1154 + "Correctness": 0.6804, + "Completeness": 0.6627, + "Conciseness": 0.3127, + "Helpfulness": 0.6608, + "Honesty": 0.6657, + "Harmlessness": 0.6782, + "3C3H Score": 0.6101 }, "Tasks Scores": { - "Question Answering (QA)": 0.0479, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.5875, - "Reasoning": 0.0881 + "Question Answering (QA)": 0.4672, + "Orthographic and Grammatical Analysis": 0.5764, + "Safety": 0.9156, + "Reasoning": 0.8583 } }, "Meta": { - "Model Name": "meta-llama/Llama-3.2-1B-Instruct", - "License": "llama3.2", - "Revision": "main", - "Precision": "bfloat16", - "Params": 1.0, + "Model Name": "claude-opus-4-20250514", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -1982,57 +2522,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2468, - "Completeness": 0.2271, - "Conciseness": 0.1657, - "Helpfulness": 0.204, - "Honesty": 0.2335, - "Harmlessness": 0.2424, - "3C3H Score": 0.2199 + "Correctness": 0.7392, + "Completeness": 0.7157, + "Conciseness": 0.4118, + "Helpfulness": 0.7194, + "Honesty": 0.724, + "Harmlessness": 0.7355, + "3C3H Score": 0.6743 }, "Tasks Scores": { - "Question Answering (QA)": 0.0782, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.9021, - "Reasoning": 0.3274 + "Question Answering (QA)": 0.5904, + "Orthographic and Grammatical Analysis": 0.3917, + "Safety": 0.9563, + "Reasoning": 0.8738 } }, "Meta": { - "Model Name": "meta-llama/Llama-3.2-3B-Instruct", - "License": "llama3.2", - "Revision": "main", - "Precision": "bfloat16", - "Params": 3.0, + "Model Name": "gpt-4.5-preview-2025-02-27", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.448, - "Completeness": 0.3725, - "Conciseness": 0.3586, - "Helpfulness": 0.3939, - "Honesty": 0.4402, - "Harmlessness": 0.4478, - "3C3H Score": 0.4102 + "Correctness": 0.7529, + "Completeness": 0.7069, + "Conciseness": 0.4877, + "Helpfulness": 0.7262, + "Honesty": 0.7517, + "Harmlessness": 0.7529, + "3C3H Score": 0.6964 }, "Tasks Scores": { - "Question Answering (QA)": 0.2719, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.8792, - "Reasoning": 0.7131 + "Question Answering (QA)": 0.5853, + "Orthographic and Grammatical Analysis": 0.5264, + "Safety": 0.9677, + "Reasoning": 0.9315 } }, "Meta": { - "Model Name": "meta-llama/Llama-3.3-70B-Instruct", - "License": "llama3.3", - "Revision": "main", - "Precision": "bfloat16", - "Params": 70.0, + "Model Name": "o3-2025-04-16", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2042,27 +2582,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0686, - "Completeness": 0.0657, - "Conciseness": 0.036, - "Helpfulness": 0.0615, - "Honesty": 0.0662, - "Harmlessness": 0.0684, - "3C3H Score": 0.0611 + "Correctness": 0.6873, + "Completeness": 0.6569, + "Conciseness": 0.3603, + "Helpfulness": 0.6659, + "Honesty": 0.676, + "Harmlessness": 0.6865, + "3C3H Score": 0.6221 }, "Tasks Scores": { - "Question Answering (QA)": 0.044, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.0, - "Reasoning": 0.1708 + "Question Answering (QA)": 0.5149, + "Orthographic and Grammatical Analysis": 0.35, + "Safety": 0.9094, + "Reasoning": 0.881 } }, "Meta": { - "Model Name": "meta-llama/Meta-Llama-3-70B-Instruct", - "License": "llama3", - "Revision": "main", - "Precision": "bfloat16", - "Params": 70.0, + "Model Name": "gpt-4.1-2025-04-14", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2072,27 +2612,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0294, - "Completeness": 0.0294, - "Conciseness": 0.0127, - "Helpfulness": 0.026, - "Honesty": 0.0272, - "Harmlessness": 0.0294, - "3C3H Score": 0.0257 + "Correctness": 0.501, + "Completeness": 0.4676, + "Conciseness": 0.3059, + "Helpfulness": 0.4824, + "Honesty": 0.4892, + "Harmlessness": 0.498, + "3C3H Score": 0.4574 }, "Tasks Scores": { - "Question Answering (QA)": 0.0299, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.0, - "Reasoning": 0.0393 + "Question Answering (QA)": 0.2823, + "Orthographic and Grammatical Analysis": 0.1042, + "Safety": 0.8396, + "Reasoning": 0.8905 } }, "Meta": { - "Model Name": "meta-llama/Meta-Llama-3-8B-Instruct", - "License": "llama3", - "Revision": "main", - "Precision": "bfloat16", - "Params": 14.963, + "Model Name": "gpt-4.1-mini-2025-04-14", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2102,27 +2642,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.2667, - "Completeness": 0.2549, - "Conciseness": 0.1257, - "Helpfulness": 0.2368, - "Honesty": 0.2507, - "Harmlessness": 0.2659, - "3C3H Score": 0.2335 + "Correctness": 0.4373, + "Completeness": 0.4176, + "Conciseness": 0.2706, + "Helpfulness": 0.4196, + "Honesty": 0.4299, + "Harmlessness": 0.4373, + "3C3H Score": 0.402 }, "Tasks Scores": { - "Question Answering (QA)": 0.1294, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.5042, - "Reasoning": 0.4762 + "Question Answering (QA)": 0.2383, + "Orthographic and Grammatical Analysis": 0.0306, + "Safety": 0.8875, + "Reasoning": 0.7518 } }, "Meta": { - "Model Name": "mistralai/Ministral-8B-Instruct-2410", - "License": "mrl", - "Revision": "main", - "Precision": "bfloat16", - "Params": 8.0, + "Model Name": "gpt-4.1-nano-2025-04-14", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2132,27 +2672,27 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.0039, - "Completeness": 0.0039, - "Conciseness": 0.0007, - "Helpfulness": 0.0022, - "Honesty": 0.0032, - "Harmlessness": 0.0039, - "3C3H Score": 0.003 + "Correctness": 0.6196, + "Completeness": 0.5912, + "Conciseness": 0.4471, + "Helpfulness": 0.589, + "Honesty": 0.6172, + "Harmlessness": 0.6189, + "3C3H Score": 0.5805 }, "Tasks Scores": { - "Question Answering (QA)": 0.0051, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.0, - "Reasoning": 0.0 + "Question Answering (QA)": 0.4543, + "Orthographic and Grammatical Analysis": 0.0819, + "Safety": 0.9469, + "Reasoning": 0.9452 } }, "Meta": { - "Model Name": "mistralai/Mistral-7B-Instruct-v0.2", - "License": "apache-2.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 7.0, + "Model Name": "o4-mini-2025-04-16", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2162,57 +2702,57 @@ { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.1003, - "Completeness": 0.0826, - "Conciseness": 0.0258, - "Helpfulness": 0.0597, - "Honesty": 0.0774, - "Harmlessness": 0.0966, - "3C3H Score": 0.0737 + "Correctness": 0.601, + "Completeness": 0.5676, + "Conciseness": 0.261, + "Helpfulness": 0.5789, + "Honesty": 0.5956, + "Harmlessness": 0.5995, + "3C3H Score": 0.5339 }, "Tasks Scores": { - "Question Answering (QA)": 0.0431, - "Orthographic and Grammatical Analysis": 0.0, - "Safety": 0.1646, - "Reasoning": 0.1405 + "Question Answering (QA)": 0.4206, + "Orthographic and Grammatical Analysis": 0.3111, + "Safety": 0.8698, + "Reasoning": 0.7613 } }, "Meta": { - "Model Name": "mistralai/Mistral-7B-Instruct-v0.3", - "License": "apache-2.0", - "Revision": "main", - "Precision": "bfloat16", - "Params": 7.0, + "Model Name": "gemini-2.5-flash-preview-05-20", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, - "Successful Entries": 339, - "Failed Entries": 1, - "Success Ratio": 0.9971 + "Successful Entries": 340, + "Failed Entries": 0, + "Success Ratio": 1.0 } }, { "claude-3.5-sonnet Scores": { "3C3H Scores": { - "Correctness": 0.501, - "Completeness": 0.4794, - "Conciseness": 0.2424, - "Helpfulness": 0.4797, - "Honesty": 0.4875, - "Harmlessness": 0.501, - "3C3H Score": 0.4485 + "Correctness": 0.6471, + "Completeness": 0.6255, + "Conciseness": 0.3061, + "Helpfulness": 0.6245, + "Honesty": 0.6365, + "Harmlessness": 0.6456, + "3C3H Score": 0.5809 }, "Tasks Scores": { - "Question Answering (QA)": 0.3437, - "Orthographic and Grammatical Analysis": 0.0514, - "Safety": 0.7979, - "Reasoning": 0.7185 + "Question Answering (QA)": 0.5298, + "Orthographic and Grammatical Analysis": 0.3403, + "Safety": 0.8771, + "Reasoning": 0.6607 } }, "Meta": { - "Model Name": "mistralai/Mistral-Large-Instruct-2411", - "License": "mrl", - "Revision": "main", - "Precision": "bfloat16", - "Params": 123.0, + "Model Name": "gemini-2.5-pro-preview-05-06", + "License": "Proprietary", + "Revision": "UNK", + "Precision": "UNK", + "Params": "UNK", "Total Entries": 340, "Successful Entries": 340, "Failed Entries": 0, @@ -2220,6 +2760,6 @@ } }, { - "_last_sync_timestamp": "2025-03-23T12:44:33.422103" + "_last_sync_timestamp": "2025-05-30T12:44:33.422103" } ] \ No newline at end of file