Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Yunxiang
commited on
Commit
·
859c0e2
1
Parent(s):
aae1bdc
version 1
Browse files- app.py +12 -0
- data/merged_leaderboards.csv +64 -0
- requirements.txt +4 -0
- src/data_processing.py +154 -0
- src/ui.py +734 -0
app.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys

# Add project root to path so we can import modules
# (makes the `src` package importable no matter which working directory
# the app is launched from).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from src.ui import create_leaderboard_ui

if __name__ == "__main__":
    # Create and launch the UI (Gradio app built in src/ui.py).
    app = create_leaderboard_ui()
    app.launch()
|
data/merged_leaderboards.csv
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model Name (LiveBench),Organization,Global Average,Reasoning Average,Coding Average,Mathematics Average,Data Analysis Average,Language Average,IF Average,Model Link (LiveBench),Arena Rank (No Style Control),Arena Rank (With Style Control),Model Name (Arena),Arena Score,95% Confidence Interval,# of Votes,Model License,Model Knowledge Cutoff,Model Link (Arena)
|
| 2 |
+
o3-2025-04-16-high,OpenAI,81.55,93.33,73.33,84.67,75.8,76.0,86.17,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
|
| 3 |
+
o3-2025-04-16-medium,OpenAI,79.22,91.0,72.62,80.66,73.21,73.48,84.32,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
|
| 4 |
+
o4-mini-2025-04-16-high,OpenAI,78.13,88.11,74.33,84.9,70.43,66.05,84.96,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
|
| 5 |
+
gemini-2.5-pro-exp-03-25,Google,77.43,87.53,58.09,89.16,79.89,69.31,80.59,https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/,1.0,1.0,Gemini-2.5-Pro-Exp-03-25,1439.0,+7/-5,9013.0,Proprietary,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemini-2.5-pro-exp-03-25
|
| 6 |
+
o4-mini-2025-04-16-medium,OpenAI,72.75,78.47,61.81,81.02,70.96,62.41,81.83,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
|
| 7 |
+
o1-2024-12-17-high,OpenAI,72.18,77.47,57.14,79.28,65.47,72.15,81.55,https://openai.com/o1/,10.0,6.0,o1-2024-12-17,1350.0,+3/-3,28990.0,Proprietary,Unknown,https://openai.com/index/o1-and-new-tools-for-developers/
|
| 8 |
+
o3-mini-2025-01-31-high,OpenAI,71.37,74.36,65.48,76.55,70.64,56.86,84.36,https://openai.com/index/openai-o3-mini/,15.0,14.0,o3-mini-high,1325.0,+4/-4,17988.0,Proprietary,Unknown,https://platform.openai.com/docs/guides/reasoning#reasoning-effort
|
| 9 |
+
gemini-2.5-flash-preview-04-17,Google,71.21,73.47,58.38,81.45,75.49,59.43,79.02,https://blog.google/products/gemini/gemini-2-5-flash-preview/,2.0,4.0,Gemini-2.5-Flash-Preview-04-17,1392.0,+10/-13,3325.0,Proprietary,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemini-2.5-flash-preview-04-17
|
| 10 |
+
claude-3-7-sonnet-20250219-thinking-64k,Anthropic,70.57,76.17,44.67,79.0,74.05,68.27,81.25,https://www.anthropic.com/news/claude-3-7-sonnet,18.0,6.0,Claude 3.7 Sonnet (thinking-32k),1303.0,+6/-7,8998.0,Proprietary,Unknown,https://www.anthropic.com/news/claude-3-7-sonnet
|
| 11 |
+
grok-3-mini-beta-high,xAI,68.33,87.61,39.71,77.0,67.87,59.09,78.7,https://x.ai/blog/grok-3,2.0,4.0,Grok-3-Preview-02-24,1402.0,+5/-3,14849.0,Proprietary,Unknown,https://x.ai/blog/grok-3
|
| 12 |
+
deepseek-r1,DeepSeek,67.47,76.58,48.48,77.91,67.6,54.77,79.49,https://huggingface.co/deepseek-ai/DeepSeek-R1,9.0,6.0,DeepSeek-R1,1358.0,+4/-5,16077.0,MIT,Unknown,https://api-docs.deepseek.com/news/news250120
|
| 13 |
+
o3-mini-2025-01-31-medium,OpenAI,67.16,69.0,58.43,71.68,66.56,54.12,83.16,https://openai.com/index/openai-o3-mini/,18.0,20.0,o3-mini,1305.0,+3/-4,24877.0,Proprietary,Unknown,https://openai.com/index/openai-o3-mini/
|
| 14 |
+
qwq-32b,Alibaba,65.69,76.72,43.0,76.08,65.03,51.48,81.83,https://qwenlm.github.io/blog/qwq-32b/,15.0,21.0,QwQ-32B,1316.0,+6/-6,7735.0,Apache 2.0,Unknown,https://huggingface.co/Qwen/QwQ-32B
|
| 15 |
+
gpt-4.5-preview-2025-02-27,OpenAI,62.13,54.42,49.0,67.94,64.33,64.76,72.33,https://openai.com/index/introducing-gpt-4-5/,2.0,2.0,GPT-4.5-Preview,1398.0,+5/-6,14520.0,Proprietary,Unknown,https://openai.com/index/introducing-gpt-4-5/
|
| 16 |
+
gemini-2.0-flash-thinking-exp-01-21,Google,62.05,61.5,35.71,74.81,69.37,48.43,82.47,https://ai.google.dev/gemini-api/docs/thinking-mode,5.0,9.0,Gemini-2.0-Flash-Thinking-Exp-01-21,1380.0,+4/-3,26309.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-thinking-exp-01-21
|
| 17 |
+
gemini-2.0-pro-exp-02-05,Google,61.59,61.75,35.33,68.54,68.02,52.5,83.38,https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/,5.0,5.0,Gemini-2.0-Pro-Exp-02-05,1380.0,+5/-4,20127.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-pro-exp-02-05
|
| 18 |
+
o3-mini-2025-01-31-low,OpenAI,59.76,55.94,50.76,61.67,62.04,48.07,80.06,https://openai.com/index/openai-o3-mini/,,,,,,,,,
|
| 19 |
+
gpt-4.1-2025-04-14,OpenAI,58.41,44.39,42.95,62.39,69.13,54.55,77.05,https://openai.com/index/gpt-4-1/,,,,,,,,,
|
| 20 |
+
claude-3-7-sonnet-20250219-base,Anthropic,58.21,49.11,32.43,64.65,63.37,63.19,76.49,https://www.anthropic.com/news/claude-3-7-sonnet,24.0,11.0,Claude 3.7 Sonnet,1293.0,+4/-5,14287.0,Proprietary,Unknown,https://www.anthropic.com/news/claude-3-7-sonnet
|
| 21 |
+
deepseek-v3-0324,DeepSeek,57.48,44.28,40.52,71.44,60.37,46.82,81.47,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,5.0,4.0,DeepSeek-V3-0324,1372.0,+9/-6,5888.0,MIT,Unknown,https://api-docs.deepseek.com/news/news250325
|
| 22 |
+
grok-3-beta,xAI,56.95,48.53,37.33,62.75,54.53,53.8,84.74,https://x.ai/blog/grok-3,2.0,4.0,Grok-3-Preview-02-24,1402.0,+5/-3,14849.0,Proprietary,Unknown,https://x.ai/blog/grok-3
|
| 23 |
+
chatgpt-4o-latest-2025-03-27,OpenAI,55.84,48.81,38.67,55.72,70.47,49.43,71.92,https://x.com/OpenAIDevs/status/1905335104211185999?t=pmYR2_xGFyWs1xOGuNxRsw&s=19,2.0,2.0,ChatGPT-4o-latest (2025-03-26),1407.0,+6/-6,8261.0,Proprietary,Unknown,https://x.com/OpenAI/status/1905331956856050135
|
| 24 |
+
gpt-4.1-mini-2025-04-14,OpenAI,55.55,53.78,47.57,58.78,64.87,38.0,70.31,https://openai.com/index/gpt-4-1/,,,,,,,,,
|
| 25 |
+
qwen2.5-max,Alibaba,55.14,38.53,33.81,56.87,67.93,58.37,75.35,https://qwenlm.github.io/blog/qwen2.5-max/,12.0,14.0,Qwen2.5-Max,1340.0,+4/-4,21169.0,Proprietary,Unknown,https://qwenlm.github.io/blog/qwen2.5-max/
|
| 26 |
+
gemini-2.0-flash-001,Google,54.89,44.25,26.19,63.19,67.55,42.39,85.79,https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/,9.0,14.0,Gemini-2.0-Flash-001,1354.0,+3/-4,22201.0,Proprietary,Unknown,https://aistudio.google.com/app/prompts/new_chat?instructions=lmsys-1121&model=gemini-2.0-flash-001
|
| 27 |
+
deepseek-r1-distill-llama-70b,DeepSeek,54.69,59.81,46.62,58.8,55.93,37.05,69.94,https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B,,,,,,,,,
|
| 28 |
+
llama-4-maverick-17b-128e-instruct,Meta,54.38,43.83,37.43,60.58,59.03,49.65,75.75,https://ai.meta.com/blog/llama-4-multimodal-intelligence/,35.0,24.0,Llama-4-Maverick-17B-128E-Instruct,1271.0,+7/-8,4917.0,Llama 4,Unknown,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
| 29 |
+
o1-mini-2024-09-12,OpenAI,53.43,51.33,41.0,60.26,57.92,44.66,65.4,https://platform.openai.com/docs/guides/reasoning,18.0,24.0,o1-mini,1304.0,+3/-2,54960.0,Proprietary,2023/10,https://platform.openai.com/docs/models/o1
|
| 30 |
+
hunyuan-turbos-20250313,Tencent,50.83,38.22,23.24,57.47,75.49,34.46,76.13,https://cloud.tencent.com/document/product/1729/104753,17.0,19.0,Hunyuan-TurboS-20250226,1302.0,+8/-10,2452.0,Proprietary,Unknown,https://cloud.tencent.com/document/product/1729/104753
|
| 31 |
+
claude-3-5-sonnet-20241022,Anthropic,50.81,43.22,32.29,50.54,55.03,54.48,69.3,https://www.anthropic.com/news/3-5-models-and-computer-use,31.0,14.0,Claude 3.5 Sonnet (20241022),1283.0,+2/-2,64670.0,Proprietary,2024/4,https://www.anthropic.com/news/3-5-models-and-computer-use
|
| 32 |
+
step-2-16k-202411,StepFun,49.86,42.39,31.09,43.68,63.72,38.41,79.88,https://www.stepfun.com/#step2,17.0,24.0,Step-2-16K-Exp,1304.0,+8/-8,5128.0,Proprietary,Unknown,https://platform.stepfun.com/docs/llm/text
|
| 33 |
+
gpt-4o-2024-08-06,OpenAI,49.21,42.58,31.81,45.72,60.91,45.63,68.58,https://openai.com/index/hello-gpt-4o/,41.0,29.0,GPT-4o-2024-08-06,1265.0,+2/-2,47982.0,Proprietary,2023/10,https://platform.openai.com/docs/models/gpt-4o
|
| 34 |
+
deepseek-r1-distill-qwen-32b,DeepSeek,48.14,44.36,52.33,60.13,45.41,30.92,55.71,https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,,,,,,,,,
|
| 35 |
+
grok-2-1212,xAI,48.11,36.72,26.14,55.94,54.45,45.79,69.63,https://x.ai/blog/grok-1212,28.0,29.0,Grok-2-08-13,1288.0,+2/-2,67102.0,Proprietary,2024/3,https://x.ai/blog/grok-2
|
| 36 |
+
gemini-2.0-flash-lite-001,Google,47.77,32.25,23.38,54.97,65.45,33.94,76.63,https://developers.googleblog.com/en/start-building-with-the-gemini-2-0-flash-family/,16.0,20.0,Gemini-2.0-Flash-Lite,1311.0,+5/-5,22122.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-lite
|
| 37 |
+
meta-llama-3.1-405b-instruct-turbo,Meta,47.54,40.58,28.81,40.49,55.85,43.58,75.9,https://www.together.ai/blog/meta-llama-3-1,,,,,,,,,
|
| 38 |
+
learnlm-1.5-pro-experimental,Google,47.49,34.86,32.38,56.71,54.97,37.86,68.16,https://ai.google.dev/gemini-api/docs/learnlm,,,,,,,,,
|
| 39 |
+
gpt-4o-2024-11-20,OpenAI,47.03,39.75,35.19,41.48,56.15,44.68,64.94,https://openai.com/index/hello-gpt-4o/,41.0,29.0,GPT-4o-2024-08-06,1265.0,+2/-2,47982.0,Proprietary,2023/10,https://platform.openai.com/docs/models/gpt-4o
|
| 40 |
+
gemma-3-27b-it,Google,46.63,34.42,25.43,52.27,51.45,41.31,74.9,https://blog.google/technology/developers/gemma-3/,11.0,14.0,Gemma-3-27B-it,1342.0,+5/-6,9976.0,Gemma,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemma-3-27b-it
|
| 41 |
+
llama-3.3-70b-instruct-turbo,Meta,45.68,32.53,24.05,41.4,49.49,43.97,82.67,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,51.0,42.0,Llama-3.3-70B-Instruct,1257.0,+2/-3,38101.0,Llama-3.3,Unknown,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
| 42 |
+
claude-3-opus-20240229,Anthropic,45.61,32.03,23.33,42.93,57.89,53.57,63.89,https://www.anthropic.com/claude,57.0,41.0,Claude 3 Opus,1247.0,+2/-2,202697.0,Proprietary,2023/8,https://www.anthropic.com/news/claude-3-family
|
| 43 |
+
command-a-03-2025,Cohere,45.33,36.33,20.43,45.54,50.06,36.7,82.9,https://cohere.com/blog/command-a,18.0,20.0,Command A (03-2025),1303.0,+7/-6,7547.0,CC-BY-NC-4.0,Unknown,https://cohere.com/blog/command-a
|
| 44 |
+
dracarys2-72b-instruct,AbacusAI,44.83,37.49,25.43,52.25,55.51,33.06,65.22,https://huggingface.co/abacusai/Dracarys2-72B-Instruct,,,,,,,,,
|
| 45 |
+
mistral-large-2411,Mistral AI,43.59,33.83,26.95,42.2,50.15,40.45,67.93,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411,56.0,57.0,Mistral-Large-2411,1249.0,+3/-3,28985.0,MRL,Unknown,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
|
| 46 |
+
qwen2.5-72b-instruct-turbo,Alibaba,43.36,34.08,21.29,51.88,51.91,36.63,64.39,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,51.0,57.0,Qwen2.5-72B-Instruct,1257.0,+2/-3,41532.0,Qwen,2024/9,https://qwenlm.github.io/blog/qwen2.5/
|
| 47 |
+
dracarys2-llama-3.1-70b-instruct,AbacusAI,42.97,36.67,21.24,40.3,53.98,42.37,63.24,https://huggingface.co/abacusai/Dracarys2-Llama-3.1-70B-Instruct,,,,,,,,,
|
| 48 |
+
gemma-3-12b-it,Google,41.25,28.61,19.09,48.14,46.56,31.27,73.83,https://blog.google/technology/developers/gemma-3/,,,,,,,,,
|
| 49 |
+
mistral-small-2503,Mistral AI,40.92,37.08,21.29,38.39,50.54,34.59,63.66,https://mistral.ai/news/mistral-small-3-1,69.0,74.0,Mistral-Small-24B-Instruct-2501,1217.0,+5/-4,14573.0,Apache 2.0,Unknown,https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501
|
| 50 |
+
phi-4,Microsoft,40.68,39.06,29.09,43.03,45.17,29.33,58.38,https://huggingface.co/microsoft/Phi-4,77.0,84.0,Phi-4,1205.0,+4/-3,25224.0,MIT,Unknown,https://huggingface.co/microsoft/phi-4
|
| 51 |
+
meta-llama-3.1-70b-instruct-turbo,Meta,40.52,29.67,19.86,32.54,53.75,38.35,68.98,https://www.together.ai/blog/meta-llama-3-1,57.0,63.0,Meta-Llama-3.1-70B-Instruct,1248.0,+2/-2,58654.0,Llama 3.1 Community,2023/12,https://ai.meta.com/blog/meta-llama-3-1/
|
| 52 |
+
amazon.nova-pro-v1:0,Amazon,40.05,28.25,20.0,37.7,48.31,38.94,67.13,https://aws.amazon.com/ai/generative-ai/nova/,59.0,64.0,Amazon Nova Pro 1.0,1245.0,+3/-4,24285.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
|
| 53 |
+
gpt-4.1-nano-2025-04-14,OpenAI,39.72,35.58,25.29,42.39,46.59,30.96,57.54,https://openai.com/index/gpt-4-1/,,,,,,,,,
|
| 54 |
+
claude-3-5-haiku-20241022,Anthropic,38.49,26.19,19.86,34.84,48.45,39.71,61.88,https://www.anthropic.com/claude/haiku,65.0,43.0,Claude 3.5 Haiku (20241022),1237.0,+3/-3,33322.0,Proprietary,Unknown,https://www.anthropic.com/news/3-5-models-and-computer-use
|
| 55 |
+
gpt-4o-mini-2024-07-18,OpenAI,37.63,25.64,25.48,38.05,49.96,29.88,56.8,https://openai.com/index/hello-gpt-4o/,36.0,43.0,GPT-4o-mini-2024-07-18,1272.0,+2/-2,71388.0,Proprietary,2023/10,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
|
| 56 |
+
gemini-1.5-flash-8b-001,Google,34.37,18.69,16.48,32.17,46.27,22.87,69.72,https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash-8b,70.0,83.0,Gemini-1.5-Flash-8B-001,1212.0,+3/-3,37686.0,Proprietary,Unknown,https://aistudio.google.com/app/prompts/new_chat?instructions=lmsys&model=gemini-1.5-flash-8b
|
| 57 |
+
amazon.nova-lite-v1:0,Amazon,33.47,32.0,15.24,34.62,37.23,27.62,54.13,https://aws.amazon.com/ai/generative-ai/nova/,69.0,80.0,Amazon Nova Lite 1.0,1217.0,+4/-4,20648.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
|
| 58 |
+
azerogpt,SoundAI,32.7,24.47,15.95,31.81,33.95,30.73,59.31,https://azero.soundai.com/#/dashboard,,,,,,,,,
|
| 59 |
+
gemma-3-4b-it,Google,30.13,19.78,11.71,31.33,39.3,15.06,63.58,https://blog.google/technology/developers/gemma-3/,,,,,,,,,
|
| 60 |
+
command-r-plus-08-2024,Cohere,29.86,21.64,8.19,22.82,38.06,30.86,57.61,https://docs.cohere.com/docs/models,69.0,70.0,Command R+ (08-2024),1215.0,+6/-6,10539.0,CC-BY-NC-4.0,2024/8,https://docs.cohere.com/docs/command-r-plus#model-details
|
| 61 |
+
qwen2.5-7b-instruct-turbo,Alibaba,29.22,22.31,11.0,36.81,35.22,17.85,52.11,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,,,,,,,,,
|
| 62 |
+
amazon.nova-micro-v1:0,Amazon,28.65,25.42,6.14,34.15,33.95,24.19,48.04,https://aws.amazon.com/ai/generative-ai/nova/,82.0,93.0,Amazon Nova Micro 1.0,1198.0,+4/-4,20663.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
|
| 63 |
+
command-r-08-2024,Cohere,26.99,20.58,6.1,18.35,33.34,27.93,55.62,https://docs.cohere.com/docs/models,89.0,84.0,Command R (08-2024),1180.0,+5/-5,10848.0,CC-BY-NC-4.0,2024/8,https://docs.cohere.com/docs/command-r-plus#model-details
|
| 64 |
+
meta-llama-3.1-8b-instruct-turbo,Meta,24.95,14.78,11.05,15.08,32.82,21.1,54.9,https://www.together.ai/blog/meta-llama-3-1,91.0,109.0,Meta-Llama-3.1-8B-Instruct,1176.0,+3/-2,52597.0,Llama 3.1 Community,2023/12,https://ai.meta.com/blog/meta-llama-3-1/
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.25.2
|
| 2 |
+
pandas
|
| 3 |
+
plotly
|
| 4 |
+
numpy
|
src/data_processing.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/data_processing.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
def load_data(file_path='data/merged_leaderboards.csv'):
    """Read the merged leaderboard CSV and normalise it for the UI.

    The returned DataFrame is sorted by 'Global Average' (descending),
    has metric columns coerced to numeric, and carries a unified
    'Model Name' column (LiveBench name preferred over Arena name).
    On any failure an empty DataFrame is returned instead of raising.
    """
    metric_columns = (
        'Global Average', 'Reasoning Average', 'Coding Average',
        'Mathematics Average', 'Data Analysis Average',
        'Language Average', 'IF Average',
        'Arena Rank (No Style Control)', 'Arena Rank (With Style Control)',
        'Arena Score', '# of Votes',
    )
    try:
        df = pd.read_csv(file_path)

        # Treat empty strings as missing values throughout.
        df = df.replace('', np.nan)

        # Coerce every metric column that is actually present to numeric.
        for col in metric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Confidence intervals like "+3/-3" are display text, keep as str.
        if '95% Confidence Interval' in df.columns:
            df['95% Confidence Interval'] = df['95% Confidence Interval'].astype(str)

        # Rank best-first; rows without a score sink to the bottom.
        if 'Global Average' in df.columns:
            df = df.sort_values('Global Average', ascending=False, na_position='last')

        # Build the unified display name, preferring the LiveBench name.
        has_livebench = 'Model Name (LiveBench)' in df.columns
        has_arena = 'Model Name (Arena)' in df.columns
        if has_livebench and has_arena:
            df['Model Name'] = df['Model Name (LiveBench)'].fillna(df['Model Name (Arena)'])
        elif has_livebench:
            df['Model Name'] = df['Model Name (LiveBench)']
        elif has_arena:
            df['Model Name'] = df['Model Name (Arena)']
        else:
            df['Model Name'] = 'Unknown Model'  # fallback when neither column exists

        # Keep only the first (i.e. highest-ranked after the sort) row
        # per unified model name to avoid clashes in later lookups.
        if 'Model Name' in df.columns:
            df = df.drop_duplicates(subset=['Model Name'], keep='first')

        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame()
|
| 54 |
+
|
| 55 |
+
def get_column_groups():
    """Return the named column groupings used by the UI tabs.

    Every group begins with the unified 'Model Name' column so the model
    stays identifiable in any view.
    """
    name_col = "Model Name"  # unified identifier produced by load_data()
    return {
        "Main Metrics": [
            name_col, "Organization", "Global Average", "Reasoning Average",
            "Coding Average", "Mathematics Average", "Data Analysis Average",
            "Language Average", "IF Average",
        ],
        "Model Details": [
            name_col, "Organization", "Model License",
            "Model Knowledge Cutoff", "Model Link (LiveBench)", "Model Link (Arena)",
        ],
        "Community Stats": [
            name_col, "Organization",
            "Arena Rank (No Style Control)", "Arena Rank (With Style Control)",
            "Arena Score", "95% Confidence Interval", "# of Votes",
        ],
        "Model Mapping": [
            name_col,
            "Model Name (LiveBench)", "Model Name (Arena)",
            "Model Link (LiveBench)", "Model Link (Arena)",
        ],
        # Superset of columns the card view may render.
        "All Displayable": [
            name_col, "Organization", "Global Average", "Reasoning Average",
            "Coding Average", "Mathematics Average", "Data Analysis Average",
            "Language Average", "IF Average", "Model License",
            "Model Knowledge Cutoff", "Model Name (LiveBench)", "Model Link (LiveBench)",
            "Model Name (Arena)", "Model Link (Arena)",
            "Arena Rank (No Style Control)", "Arena Rank (With Style Control)",
            "Arena Score", "95% Confidence Interval", "# of Votes",
        ],
    }
|
| 95 |
+
|
| 96 |
+
def filter_data(df, search, min_global, organization):
    """Return the rows of *df* that satisfy all three UI filters.

    Filters: a minimum 'Global Average' (only when > 0), an exact
    organization match ("All" disables it), and a case-insensitive
    substring search over the model-name and organization columns.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    result = df.copy()

    # Minimum Global Average threshold; NaN scores never pass .ge().
    if min_global > 0 and 'Global Average' in result.columns:
        result = result[result['Global Average'].ge(min_global)]

    # Exact organization match, explicitly excluding missing values.
    if organization and organization != "All" and 'Organization' in result.columns:
        result = result[result['Organization'].eq(organization) & result['Organization'].notna()]

    # Substring search (regex-escaped so user input is matched literally).
    if search and search.strip():
        needle = re.escape(search.strip().lower())
        mask = pd.Series([False] * len(result), index=result.index)
        for col in ('Model Name', 'Organization',
                    'Model Name (LiveBench)', 'Model Name (Arena)'):
            if col in result.columns:
                # Stringify first so NaN cells never match.
                mask |= result[col].astype(str).str.lower().str.contains(needle, na=False)
        result = result[mask]

    return result
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def get_organization_list(df):
    """Return dropdown choices: "All" followed by sorted unique organizations."""
    if df is None or 'Organization' not in df.columns:
        return ["All"]
    # Skip missing/falsy entries, stringify, and sort for a stable order.
    unique_orgs = df['Organization'].dropna().unique()
    return ["All"] + sorted(str(org) for org in unique_orgs if org)
|
| 143 |
+
|
| 144 |
+
def get_top_models(df, column='Global Average', n=15):
    """Return the *n* best rows by *column*.

    Rank columns sort ascending (lower rank is better); every other
    metric sorts descending.  Rows missing the metric always sink to
    the bottom and thus never enter the top *n* prematurely.
    """
    if df is None or df.empty or column not in df.columns:
        return pd.DataFrame()

    ascending = 'Rank' in column  # ranks improve downward
    ordered = df.sort_values(column, ascending=ascending, na_position='last')
    return ordered.head(n)
|
src/ui.py
ADDED
|
@@ -0,0 +1,734 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/ui.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
from src.data_processing import (
|
| 7 |
+
load_data, filter_data, get_organization_list,
|
| 8 |
+
get_column_groups, get_top_models
|
| 9 |
+
)
|
| 10 |
+
import numpy as np # Needed for scaling check
|
| 11 |
+
|
| 12 |
+
# --- Global Setup ---
# Runs once at import time; the UI callbacks below read these module-level
# globals (df_global, column_groups_global, radar_metrics, arena min/max).
try:
    df_global = load_data()
    if df_global.empty:
        print("Warning: Dataframe is empty after loading.")
        # Add Arena Score default if data loading creates an empty frame,
        # so downstream column lookups don't KeyError on an empty UI.
        df_global = pd.DataFrame(columns=['Model Name', 'Organization', 'Global Average', 'Arena Score'])

    column_groups_global = get_column_groups()
    # Consistent model identifier from data_processing.
    # Safely get the identifier, default to 'Model Name' if group is missing/empty.
    model_identifier_col = column_groups_global.get("Main Metrics", ["Model Name"])[0] if column_groups_global.get("Main Metrics") else "Model Name"

    # Define base radar metrics (axes of the radar/spider chart).
    base_radar_metrics = [
        "Reasoning Average", "Coding Average", "Mathematics Average",
        "Data Analysis Average", "Language Average", "IF Average"
    ]
    # Add Arena Score if available in the data.
    if 'Arena Score' in df_global.columns:
        base_radar_metrics.append('Arena Score')

    # Filter to metrics actually present in the loaded frame.
    radar_metrics = [m for m in base_radar_metrics if m in df_global.columns]

    # Calculate global min/max for Arena Score scaling (handle NaNs and
    # potential lack of the column).  These bounds are later used to map
    # Arena Score onto the same axis range as the 0-100 metrics.
    global_arena_min = np.nan
    global_arena_max = np.nan
    if 'Arena Score' in df_global.columns:
        # Ensure the column exists before trying to access it.
        if not df_global['Arena Score'].isna().all():  # Check if there are any non-NaN values
            global_arena_min = df_global['Arena Score'].min(skipna=True)
            global_arena_max = df_global['Arena Score'].max(skipna=True)
            # Handle case where min/max might still be NaN (e.g., column exists but all values are NaN).
            if pd.isna(global_arena_min) or pd.isna(global_arena_max):
                global_arena_min = 0  # Default scale if calculation fails
                global_arena_max = 1  # Avoid division by zero later
            elif global_arena_max == global_arena_min:
                # Avoid division by zero if all scores are the same.
                global_arena_max += 1  # Or set a default range like 0-100? Adjust as needed.
        else:
            # Column exists but all values are NaN.
            global_arena_min = 0
            global_arena_max = 1
    else:
        # Arena Score column doesn't exist.
        global_arena_min = 0
        global_arena_max = 1


except Exception as e:
    # Last-resort guard: the module must still import so the app can show
    # an error instead of crashing at startup.
    print(f"Critical error during global setup: {e}")
    df_global = pd.DataFrame({'Error': [f'Failed to load data: {e}']})
    column_groups_global = {}
    model_identifier_col = 'Error'
    radar_metrics = []
    global_arena_min, global_arena_max = 0, 1  # Default values on error
|
| 70 |
+
|
| 71 |
+
# --- Helper Functions ---
|
| 72 |
+
def get_valid_columns(df, column_list):
    """Return the entries of *column_list* that are actual columns of *df*.

    Order and duplicates in *column_list* are preserved. Returns an empty
    list when either argument is None, so callers can pass through optional
    config without pre-checking.
    """
    if df is None or column_list is None:
        return []
    available = set(df.columns)  # O(1) membership per lookup
    return [name for name in column_list if name in available]
|
| 76 |
+
|
| 77 |
+
# --- UI Component Creation Functions ---
|
| 78 |
+
# create_model_card remains the same as previous version
|
| 79 |
+
def create_model_card(model_data):
    """Create a detailed HTML card view for a single model.

    Args:
        model_data: dict of merged leaderboard fields for one model, keyed by
            the CSV column names (e.g. 'Global Average', 'Arena Score', ...).
            Typically produced by `Series.to_dict()` on one row of the merged
            dataframe, so values may be NaN for fields missing on one side.

    Returns:
        str: an HTML fragment (`<div class="model-card">...</div>`); on bad
        input, a short `<p>` error fragment instead.
    """
    if not isinstance(model_data, dict): return "<p>Error: Invalid model data format.</p>"

    # Use the globally defined identifier column to pull the display name.
    # NOTE(review): `model_identifier_col` is a module-level global set during
    # import-time data loading — assumed to name the merged "Model Name" column.
    model_name = model_data.get(model_identifier_col, 'Unknown Model')

    # Helper to safely get data, handling potential NaNs or missing keys.
    def get_data(key, default='N/A'):
        val = model_data.get(key)
        # pd.isna covers both None and float('nan') coming from the merged CSV.
        return default if pd.isna(val) else val

    # Determine the primary "Learn More" link, prioritizing LiveBench then Arena.
    model_link_livebench = get_data('Model Link (LiveBench)', default=None)
    model_link_arena = get_data('Model Link (Arena)', default=None)
    # Ensure we don't use the literal string 'N/A' as a link target.
    model_link = None
    if model_link_livebench and model_link_livebench != 'N/A':
        model_link = model_link_livebench
    elif model_link_arena and model_link_arena != 'N/A':
        model_link = model_link_arena

    # Build the card HTML structure: header, two metric tables side by side
    # (LiveBench performance vs. Arena community stats), then model details.
    card_html = f"""
    <div class="model-card">
        <h3>{model_name}</h3>
        <h4>by {get_data('Organization')}</h4>
        <div class="metrics">
            <div class="metric-group">
                <h5>Performance Metrics</h5>
                <table class="metric-table">
                    <tr><td>Global Average:</td><td><b>{get_data('Global Average', 'N/A')}</b></td></tr>
                    <tr><td>Reasoning:</td><td>{get_data('Reasoning Average', 'N/A')}</td></tr>
                    <tr><td>Coding:</td><td>{get_data('Coding Average', 'N/A')}</td></tr>
                    <tr><td>Mathematics:</td><td>{get_data('Mathematics Average', 'N/A')}</td></tr>
                    <tr><td>Data Analysis:</td><td>{get_data('Data Analysis Average', 'N/A')}</td></tr>
                    <tr><td>Language:</td><td>{get_data('Language Average', 'N/A')}</td></tr>
                    <tr><td>IF Average:</td><td>{get_data('IF Average', 'N/A')}</td></tr>
                </table>
            </div>
            <div class="metric-group">
                <h5>Community Data (Arena)</h5>
                <table class="metric-table">
                    <tr><td>Rank (No Style Ctrl):</td><td>{get_data('Arena Rank (No Style Control)', 'N/A')}</td></tr>
                    <tr><td>Rank (Style Ctrl):</td><td>{get_data('Arena Rank (With Style Control)', 'N/A')}</td></tr>
                    <tr><td>Arena Score:</td><td>{get_data('Arena Score', 'N/A')}</td></tr>
                    <tr><td>Confidence Interval:</td><td>{get_data('95% Confidence Interval', 'N/A')}</td></tr>
                    <tr><td># of Votes:</td><td>{get_data('# of Votes', 'N/A')}</td></tr>
                </table>
            </div>
        </div>
        <div class="model-details">
            <h5>Model Information</h5>
            <table class="detail-table">
                <tr><td>License:</td><td>{get_data('Model License', 'N/A')}</td></tr>
                <tr><td>Knowledge Cutoff:</td><td>{get_data('Model Knowledge Cutoff', 'N/A')}</td></tr>
                {f"<tr><td>LiveBench Name:</td><td>{get_data('Model Name (LiveBench)', 'N/A')}</td></tr>" if get_data('Model Name (LiveBench)', default=None) else ""}
                {f"<tr><td>Arena Name:</td><td>{get_data('Model Name (Arena)', 'N/A')}</td></tr>" if get_data('Model Name (Arena)', default=None) else ""}
                {f"<tr><td>LiveBench Link:</td><td><a href='{model_link_livebench}' target='_blank'>{model_link_livebench}</a></td></tr>" if model_link_livebench and model_link_livebench != 'N/A' else ""}
                {f"<tr><td>Arena Link:</td><td><a href='{model_link_arena}' target='_blank'>{model_link_arena}</a></td></tr>" if model_link_arena and model_link_arena != 'N/A' else ""}
            </table>
        </div>
    """
    # Add the "Learn More" button only if a valid link was found above.
    if model_link:
        card_html += f"""<div class="model-link"><a href="{model_link}" target="_blank" rel="noopener noreferrer">Learn More</a></div>"""

    card_html += """</div>"""  # Close model-card div
    return card_html
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# create_comparison_chart remains the same as previous version
|
| 153 |
+
def create_comparison_chart(df, metric):
    """Build a bar chart of the top 15 models ranked by *metric*.

    Args:
        df: the merged leaderboard dataframe (may be None/empty).
        metric: column name to rank and plot on the y axis.

    Returns:
        plotly.graph_objects.Figure: a px.bar figure colored by Organization
        with a rich hover template, or an empty annotated figure when no
        data/metric is available.
    """
    if df is None or df.empty or metric not in df.columns:
        return go.Figure().update_layout(
            title=f"No data available for {metric}", xaxis_title="Model", yaxis_title=metric
        )

    # get_top_models handles sort direction correctly (ascending for ranks).
    leaders = get_top_models(df, metric, n=15)
    if leaders.empty:
        return go.Figure().update_layout(
            title=f"No models found for {metric}", xaxis_title="Model", yaxis_title=metric
        )

    # Rank-style metrics read best smallest-first; score metrics largest-first.
    order = 'total ascending' if 'Rank' in metric else 'total descending'

    # Columns carried into the hover template via customdata — keep only
    # those that actually exist in the plotted frame, preserving this order.
    hover_cols = [
        c for c in (
            model_identifier_col,                # index 0 (model name)
            'Organization',                      # index 1
            'Arena Score',                       # index 2
            'Arena Rank (No Style Control)',     # index 3
            'Global Average',                    # index 4
        )
        if c in leaders.columns
    ]

    fig = px.bar(
        leaders,
        x=model_identifier_col,
        y=metric,
        color='Organization' if 'Organization' in leaders.columns else None,
        title=f'Top 15 Models by {metric}',
        labels={model_identifier_col: "Model Name", "Organization": "Organization", metric: metric},
        custom_data=hover_cols,
        height=500,
    )

    fig.update_layout(
        xaxis_title="Model Name",
        yaxis_title=metric,
        xaxis={'categoryorder': order},
        plot_bgcolor='rgba(240, 240, 240, 0.8)',
        margin=dict(l=40, r=20, t=60, b=120),  # room for tilted tick labels
    )
    fig.update_xaxes(tickangle=45)  # tilt model names for readability

    # --- Assemble the hover template from customdata indices ---
    segments = []

    def append_segment(column, label, fmt=""):
        # Only reference columns actually present in customdata.
        if column in hover_cols:
            pos = hover_cols.index(column)
            segments.append(f"<b>{label}:</b> %{{customdata[{pos}]{fmt}}}<br>")

    append_segment(model_identifier_col, "Model")
    append_segment('Organization', "Organization")
    # The plotted metric always comes from the y axis itself.
    segments.append(f"<b>{metric}:</b> %{{y:,.2f}}<br>")

    # Supplementary context fields, skipping whichever one is already plotted.
    for column, label, fmt in (
        ('Global Average', "Global Avg", ":,.2f"),
        ('Arena Score', "Arena Score", ":,.0f"),
        ('Arena Rank (No Style Control)', "Arena Rank", ":,.0f"),
    ):
        if column != metric:
            append_segment(column, label, fmt)

    segments.append("<extra></extra>")  # suppress the secondary trace box
    fig.update_traces(hovertemplate="".join(segments))

    return fig
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# create_radar_chart remains the same as previous version
|
| 232 |
+
def create_radar_chart(df, model_names, metrics):
    """Create a radar chart comparing multiple models across metrics.

    Each metric is plotted on a 0-100 radial axis. 'Arena Score' (an Elo-style
    rating on a different scale) is min-max scaled to 0-100 using the
    module-level globals `global_arena_min` / `global_arena_max` and relabeled
    'Arena Score (Scaled)'; all other metrics are clipped to [0, 100].

    Args:
        df: source dataframe containing `model_identifier_col` plus the metric
            columns (typically the full merged leaderboard).
        model_names: list of model identifiers to plot.
        metrics: candidate metric column names; missing ones are dropped.

    Returns:
        plotly.graph_objects.Figure: one closed Scatterpolar trace per model,
        or an empty annotated figure when inputs/metrics don't resolve.
    """
    # Input validation — any missing piece yields an explanatory empty figure.
    if df is None or df.empty or not model_names or not metrics:
        return go.Figure().update_layout(title="Select models and ensure metrics exist", polar=dict(radialaxis=dict(visible=False)))

    # Ensure metrics requested actually exist in the dataframe.
    valid_metrics = [m for m in metrics if m in df.columns]
    if not valid_metrics:
        return go.Figure().update_layout(title="None of the selected metrics exist in the data", polar=dict(radialaxis=dict(visible=False)))

    # Filter data for the selected models.
    # Use .loc + .copy() to avoid SettingWithCopyWarning if modified later.
    filtered_df = df.loc[df[model_identifier_col].isin(model_names)].copy()
    if filtered_df.empty:
        return go.Figure().update_layout(title="Selected models not found in data", polar=dict(radialaxis=dict(visible=False)))

    fig = go.Figure()
    radar_plot_metrics = []  # canonical list of axis labels (theta categories)

    # Process each selected model row into one trace.
    processed_models_count = 0  # traces actually added to the figure
    for _, row in filtered_df.iterrows():
        model_values = []          # radial values for this model
        current_radar_metrics = []  # axis labels produced for this row

        for metric in valid_metrics:
            metric_label = metric  # default axis label
            value = 0  # default when the cell is missing or non-numeric

            if metric == 'Arena Score':
                metric_label = 'Arena Score (Scaled)'  # distinct label on the axis
                if 'Arena Score' in row and pd.notna(row['Arena Score']):
                    score = row['Arena Score']
                    # Min-max scale to 0-100 using the global range; guard
                    # against a degenerate (zero-width) range.
                    # NOTE(review): assumes global_arena_min/max were computed
                    # at module import time — confirm against the setup block.
                    if global_arena_max > global_arena_min:
                        scaled_score = ((score - global_arena_min) / (global_arena_max - global_arena_min)) * 100
                        value = np.clip(scaled_score, 0, 100)  # clamp to axis range
                    else:
                        value = 50  # neutral midpoint when scaling is impossible
                # else: value stays 0 (missing Arena Score)
            elif metric in row and pd.notna(row[metric]):
                # Standard metric: coerce to numeric, clip into the axis range.
                numeric_val = pd.to_numeric(row[metric], errors='coerce')
                if pd.notna(numeric_val):
                    value = np.clip(numeric_val, 0, 100)
                # else: value stays 0 (non-numeric cell)

            # Record the value together with its axis label.
            model_values.append(value)
            current_radar_metrics.append(metric_label)

        # The first processed model fixes the canonical theta (axis) ordering.
        if not radar_plot_metrics:
            radar_plot_metrics = current_radar_metrics

        # Add a trace only if this row produced the expected number of values.
        if model_values and len(model_values) == len(radar_plot_metrics):
            r_values = model_values + [model_values[0]]  # repeat first point to close the polygon
            theta_values = radar_plot_metrics + [radar_plot_metrics[0]]  # close theta likewise

            fig.add_trace(go.Scatterpolar(
                r=r_values,
                theta=theta_values,
                fill='toself',
                name=row[model_identifier_col],  # legend entry
                hoverinfo='text',  # use the custom per-point text below
                # One "label: value" string per vertex, plus the closing point.
                text=[f"{metric_label}: {val:.2f}" for metric_label, val in zip(radar_plot_metrics, model_values)] + [f"{radar_plot_metrics[0]}: {model_values[0]:.2f}"]
            ))
            processed_models_count += 1

    # Final figure layout adjustments.
    if processed_models_count == 0:  # no traces were added
        return go.Figure().update_layout(title="No data to display for selected models/metrics", polar=dict(radialaxis=dict(visible=False)))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]  # fixed range matches the clipping/scaling above
            )
        ),
        showlegend=True,
        legend_title_text='Model Name',
        title="Model Comparison Across Metrics",
        height=600
    )

    return fig
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
# --- Main UI Function ---
|
| 326 |
+
|
| 327 |
+
def create_leaderboard_ui():
|
| 328 |
+
"""Create the main Gradio UI"""
|
| 329 |
+
css = """
|
| 330 |
+
/* General Layout & Styling */
|
| 331 |
+
.gradio-container { max-width: 95% !important; margin: 0 auto !important;}
|
| 332 |
+
.container { max-width: none; margin: 0 auto; padding: 0 15px; }
|
| 333 |
+
.header { text-align: center; margin-bottom: 1rem; } /* Reduced margin */
|
| 334 |
+
.intro-text { max-width: 800px; margin: 0 auto 2rem auto; padding: 15px; background-color: #f8f9fa; border-radius: 8px; text-align: left; font-size: 0.95em; line-height: 1.6; border: 1px solid #e9ecef; }
|
| 335 |
+
.intro-text h4 { margin-top: 0; margin-bottom: 10px; color: #4a69bd; }
|
| 336 |
+
.intro-text p { margin-bottom: 10px; }
|
| 337 |
+
.intro-text ul { margin-left: 20px; margin-bottom: 10px; }
|
| 338 |
+
.intro-text a { color: #3b5998; text-decoration: none; }
|
| 339 |
+
.intro-text a:hover { text-decoration: underline; }
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
/* Filter Container */
|
| 343 |
+
.filter-container { background-color: #f5f7fa; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); }
|
| 344 |
+
|
| 345 |
+
/* Model Card Styling */
|
| 346 |
+
.model-card { background: white; border-radius: 8px; padding: 20px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-bottom: 20px; border: 1px solid #eee; }
|
| 347 |
+
.model-card h3 { margin-top: 0; color: #333; }
|
| 348 |
+
.model-card h4 { margin-bottom: 15px; color: #555; font-weight: normal; }
|
| 349 |
+
.metrics { display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 20px; margin: 15px 0; }
|
| 350 |
+
.metric-group h5 { margin-bottom: 10px; border-bottom: 1px solid #eee; padding-bottom: 5px; color: #4a69bd; font-size: 1em; }
|
| 351 |
+
.metric-table, .detail-table { width: 100%; border-collapse: collapse; font-size: 0.9em; }
|
| 352 |
+
.metric-table td, .detail-table td { padding: 5px 0; vertical-align: top; }
|
| 353 |
+
.metric-table td:first-child { color: #555; width: 60%; }
|
| 354 |
+
.metric-table td:last-child { font-weight: bold; text-align: right; }
|
| 355 |
+
.detail-table td:first-child { color: #555; width: 30%;}
|
| 356 |
+
.detail-table td:last-child { width: 70%; word-wrap: break-word; } /* Allow long details to wrap */
|
| 357 |
+
.model-details h5 { margin-top: 20px; margin-bottom: 10px; border-bottom: 1px solid #eee; padding-bottom: 5px; color: #4a69bd; font-size: 1em; }
|
| 358 |
+
.model-link { margin-top: 15px; text-align: right; }
|
| 359 |
+
.model-link a { display: inline-block; background: #4a69bd; color: white !important; padding: 8px 16px; border-radius: 4px; text-decoration: none; font-weight: bold; font-size: 0.9em; transition: background-color 0.2s ease; }
|
| 360 |
+
.model-link a:hover { background: #3b549a; }
|
| 361 |
+
|
| 362 |
+
/* DataFrame Table Styling */
|
| 363 |
+
.gradio-dataframe { overflow-x: auto; } /* Add horizontal scroll if needed */
|
| 364 |
+
.gradio-dataframe table.dataframe { table-layout: auto; width: 100%; border-collapse: collapse; border: 1px solid #ddd; font-size: 0.9em; }
|
| 365 |
+
.gradio-dataframe table.dataframe th, .gradio-dataframe table.dataframe td { white-space: normal; word-wrap: break-word; padding: 8px 10px; text-align: left; vertical-align: top; }
|
| 366 |
+
.gradio-dataframe table.dataframe th { background-color: #f2f2f2; border-bottom: 2px solid #ddd; font-weight: bold; }
|
| 367 |
+
.gradio-dataframe table.dataframe td { border-bottom: 1px solid #eee; }
|
| 368 |
+
.gradio-dataframe table.dataframe tr:nth-child(even) { background-color: #f9f9f9; }
|
| 369 |
+
/* REMOVED: .gradio-dataframe table.dataframe tr:hover { background-color: #e9f7ef; } */ /* Removed hover effect */
|
| 370 |
+
|
| 371 |
+
/* Attempt to hide the multi-cell selection visual */
|
| 372 |
+
.gradio-dataframe table.dataframe td::selection {
|
| 373 |
+
background-color: transparent; /* Make selection background invisible */
|
| 374 |
+
color: inherit; /* Keep text color the same */
|
| 375 |
+
}
|
| 376 |
+
.gradio-dataframe table.dataframe td::-moz-selection { /* Firefox */
|
| 377 |
+
background-color: transparent;
|
| 378 |
+
color: inherit;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
/* Plot Styling */
|
| 383 |
+
.gradio-plot { min-height: 400px; padding-top: 10px; }
|
| 384 |
+
|
| 385 |
+
/* Text Selection Control */
|
| 386 |
+
/* Allow selection everywhere by default */
|
| 387 |
+
body, .gradio-container {
|
| 388 |
+
user-select: text;
|
| 389 |
+
-webkit-user-select: text;
|
| 390 |
+
-ms-user-select: text;
|
| 391 |
+
}
|
| 392 |
+
/* Ensure dataframes and model cards specifically allow text selection */
|
| 393 |
+
/* This allows copying text content */
|
| 394 |
+
.gradio-dataframe, .model-card {
|
| 395 |
+
user-select: text !important;
|
| 396 |
+
-webkit-user-select: text !important;
|
| 397 |
+
-ms-user-select: text !important;
|
| 398 |
+
}
|
| 399 |
+
"""
|
| 400 |
+
|
| 401 |
+
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue)) as app:
|
| 402 |
+
# --- Header and Introduction ---
|
| 403 |
+
gr.HTML("""<div class="header"><h1>LLM Leaderboard Explorer</h1><p>Interactive visualization of merged leaderboards data</p></div>""")
|
| 404 |
+
|
| 405 |
+
# --- ADDED: Introduction Section ---
|
| 406 |
+
gr.Markdown("""
|
| 407 |
+
<div class="intro-text">
|
| 408 |
+
<h4>Welcome!</h4>
|
| 409 |
+
<p>This application provides an interactive view of combined data from two leading LLM evaluation platforms:</p>
|
| 410 |
+
<ul>
|
| 411 |
+
<li><a href="https://livebench.ai/#/" target="_blank">LiveBench</a>: LiveBench is a dynamic benchmark, featuring monthly updated, contamination-free tasks with objective scoring across diverse domains.</li>
|
| 412 |
+
<li><a href="https://huggingface.co/spaces/lmarena-ai/chatbot-arena-leaderboard" target="_blank">LMSYS Chatbot Arena</a>: Uses crowd-sourced human preferences (Elo ratings) to rank models based on conversation quality.</li>
|
| 413 |
+
</ul>
|
| 414 |
+
<h4>Key Data Points:</h4>
|
| 415 |
+
<ul>
|
| 416 |
+
<li><b>Performance Metrics (LiveBench):</b> Includes 'Global Average' score and specific capability scores like 'Reasoning', 'Coding', 'Mathematics', etc. Higher is generally better.</li>
|
| 417 |
+
<li><b>Community Stats (LMSYS):</b> Features 'Arena Score' (Elo rating) and corresponding Ranks. Higher scores/lower ranks are better.</li>
|
| 418 |
+
<li><b>Model Details:</b> Provides information like Organization, License, and Knowledge Cutoff date.</li>
|
| 419 |
+
</ul>
|
| 420 |
+
<h4>How to Use This App:</h4>
|
| 421 |
+
<ul>
|
| 422 |
+
<li><b>Filter & Search:</b> Use the controls above the tabs to search for models or filter by organization and minimum 'Global Average' score.</li>
|
| 423 |
+
<li><b>Explore Tabs:</b> View different slices of the data (Performance, Details, Community Stats, Mapping).</li>
|
| 424 |
+
<li><b>View Model Card:</b> Click on any row in the tables (except in the Visualizations tab) to see a detailed card with all metrics for that model.</li>
|
| 425 |
+
<li><b>Visualize & Compare:</b> Use the 'Visualizations' tab to compare top models on specific metrics (Bar Chart) or compare selected models across multiple dimensions (Radar Chart).</li>
|
| 426 |
+
</ul>
|
| 427 |
+
</div>
|
| 428 |
+
""", elem_classes="intro-text-container") # Added elem_classes for potential container styling if needed
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
# --- Filter Components ---
|
| 432 |
+
with gr.Row(elem_classes="filter-container"):
|
| 433 |
+
search_input = gr.Textbox(label="Search Models/Org", placeholder="Search...", show_label=False, scale=2)
|
| 434 |
+
org_choices = get_organization_list(df_global) # Get org list once
|
| 435 |
+
org_dropdown = gr.Dropdown(choices=org_choices, label="Organization", value="All", show_label=False, scale=1, min_width=160)
|
| 436 |
+
max_slider_val = 100
|
| 437 |
+
if 'Global Average' in df_global.columns and not df_global['Global Average'].isna().all():
|
| 438 |
+
# Calculate max based on data, ensuring it's at least 100
|
| 439 |
+
max_slider_val = max(100, df_global['Global Average'].max(skipna=True))
|
| 440 |
+
min_score_slider = gr.Slider(minimum=0, maximum=max_slider_val, value=0, label="Min Global Avg", step=1, scale=2, min_width=200)
|
| 441 |
+
|
| 442 |
+
# --- Tabbed Interface ---
|
| 443 |
+
with gr.Tabs() as tabs:
|
| 444 |
+
# Get column sets safely, handling potential missing groups or empty lists
|
| 445 |
+
main_metrics_cols = get_valid_columns(df_global, column_groups_global.get("Main Metrics", []))
|
| 446 |
+
model_details_cols = get_valid_columns(df_global, column_groups_global.get("Model Details", []))
|
| 447 |
+
community_stats_cols = get_valid_columns(df_global, column_groups_global.get("Community Stats", []))
|
| 448 |
+
model_mapping_cols = get_valid_columns(df_global, column_groups_global.get("Model Mapping", [])) # Now includes 'Model Name'
|
| 449 |
+
|
| 450 |
+
# Tab 1: Performance Metrics
|
| 451 |
+
with gr.TabItem("Performance Metrics"):
|
| 452 |
+
main_metrics_table = gr.DataFrame(
|
| 453 |
+
interactive=True, # Enable selection
|
| 454 |
+
wrap=True,
|
| 455 |
+
elem_classes="gradio-dataframe"
|
| 456 |
+
)
|
| 457 |
+
gr.Markdown("Click a row for details.")
|
| 458 |
+
main_metrics_card_output = gr.HTML() # Output for this tab's card
|
| 459 |
+
|
| 460 |
+
# Tab 2: Model Details
|
| 461 |
+
with gr.TabItem("Model Details"):
|
| 462 |
+
model_details_table = gr.DataFrame(
|
| 463 |
+
interactive=True, # Enable selection
|
| 464 |
+
wrap=True,
|
| 465 |
+
elem_classes="gradio-dataframe"
|
| 466 |
+
)
|
| 467 |
+
gr.Markdown("Click a row for details.")
|
| 468 |
+
model_details_card_output = gr.HTML() # Output for this tab's card
|
| 469 |
+
|
| 470 |
+
# Tab 3: Community Stats
|
| 471 |
+
with gr.TabItem("Community Stats"):
|
| 472 |
+
community_stats_table = gr.DataFrame(
|
| 473 |
+
interactive=True, # Enable selection
|
| 474 |
+
wrap=True,
|
| 475 |
+
elem_classes="gradio-dataframe"
|
| 476 |
+
)
|
| 477 |
+
gr.Markdown("Click a row for details.")
|
| 478 |
+
community_stats_card_output = gr.HTML() # Output for this tab's card
|
| 479 |
+
|
| 480 |
+
# Tab 4: Model Mapping
|
| 481 |
+
with gr.TabItem("Model Mapping"):
|
| 482 |
+
model_mapping_table = gr.DataFrame(
|
| 483 |
+
interactive=True, # Enable selection
|
| 484 |
+
wrap=True,
|
| 485 |
+
elem_classes="gradio-dataframe"
|
| 486 |
+
)
|
| 487 |
+
gr.Markdown("Click a row for details.")
|
| 488 |
+
model_mapping_card_output = gr.HTML() # Output for this tab's card
|
| 489 |
+
|
| 490 |
+
# Tab 5: Visualizations (No selection needed here)
|
| 491 |
+
with gr.TabItem("Visualizations"):
|
| 492 |
+
gr.Markdown("### Compare Top Models by Metric")
|
| 493 |
+
with gr.Row():
|
| 494 |
+
# Define potential metrics for the bar chart
|
| 495 |
+
perf_metrics = ["Global Average", "Reasoning Average", "Coding Average", "Mathematics Average", "Data Analysis Average", "Language Average", "IF Average"]
|
| 496 |
+
arena_metrics = ["Arena Score", "Arena Rank (No Style Control)", "Arena Rank (With Style Control)", "# of Votes"]
|
| 497 |
+
# Combine and filter based on actual columns in the dataframe
|
| 498 |
+
available_bar_metrics = perf_metrics + [m for m in arena_metrics if m in df_global.columns]
|
| 499 |
+
valid_bar_metrics = get_valid_columns(df_global, available_bar_metrics)
|
| 500 |
+
# Set default value safely
|
| 501 |
+
default_bar_metric = valid_bar_metrics[0] if valid_bar_metrics else None
|
| 502 |
+
metric_dropdown = gr.Dropdown(choices=valid_bar_metrics, label="Select Metric for Bar Chart", value=default_bar_metric)
|
| 503 |
+
bar_chart = gr.Plot(label="Top 15 Model Comparison", elem_classes="gradio-plot")
|
| 504 |
+
|
| 505 |
+
gr.Markdown("### Radar Chart Comparison")
|
| 506 |
+
with gr.Row():
|
| 507 |
+
# Get model choices from the identifier column
|
| 508 |
+
model_choices = []
|
| 509 |
+
if model_identifier_col in df_global.columns and not df_global[model_identifier_col].isna().all():
|
| 510 |
+
model_choices = sorted(df_global[model_identifier_col].dropna().unique().tolist())
|
| 511 |
+
|
| 512 |
+
# Determine default models for radar chart (e.g., top 3 by Global Average)
|
| 513 |
+
default_radar_models = []
|
| 514 |
+
if 'Global Average' in df_global.columns and not df_global.empty and model_identifier_col in df_global.columns:
|
| 515 |
+
# Ensure we don't select more models than exist and handle potential NaNs
|
| 516 |
+
n_top = min(3, len(df_global.dropna(subset=['Global Average', model_identifier_col])))
|
| 517 |
+
if n_top > 0:
|
| 518 |
+
default_radar_models = df_global.nlargest(n_top, 'Global Average')[model_identifier_col].tolist()
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
models_multiselect = gr.Dropdown(choices=model_choices, label="Select Models for Radar Chart (up to 5)", multiselect=True, max_choices=5, value=default_radar_models)
|
| 522 |
+
radar_chart = gr.Plot(label="Radar Comparison", elem_classes="gradio-plot")
|
| 523 |
+
|
| 524 |
+
# --- Event Handling --- (Keep this section as it was in the previous correct version) ---
|
| 525 |
+
|
| 526 |
+
# Function to update all tables and plots based on filters
|
| 527 |
+
def update_on_filter(search, min_global, organization):
|
| 528 |
+
filtered_df = filter_data(df_global, search, min_global, organization)
|
| 529 |
+
|
| 530 |
+
# Prepare data for each table (handle empty filtered results)
|
| 531 |
+
def get_table_data(df, cols):
|
| 532 |
+
# Ensure cols is a list even if None/empty
|
| 533 |
+
valid_cols = [c for c in cols if c in df.columns] if cols else []
|
| 534 |
+
return df[valid_cols].fillna("N/A") if not df.empty and valid_cols else pd.DataFrame(columns=valid_cols)
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
main_metrics_data = get_table_data(filtered_df, main_metrics_cols)
|
| 538 |
+
model_details_data = get_table_data(filtered_df, model_details_cols)
|
| 539 |
+
community_stats_data = get_table_data(filtered_df, community_stats_cols)
|
| 540 |
+
model_mapping_data = get_table_data(filtered_df, model_mapping_cols) # Will now include 'Model Name'
|
| 541 |
+
|
| 542 |
+
# Update model choices for radar dropdown based on filtered results
|
| 543 |
+
models_list = []
|
| 544 |
+
if not filtered_df.empty and model_identifier_col in filtered_df.columns and not filtered_df[model_identifier_col].isna().all():
|
| 545 |
+
models_list = sorted(filtered_df[model_identifier_col].dropna().unique().tolist())
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
# Update bar chart
|
| 549 |
+
current_bar_metric = metric_dropdown.value if hasattr(metric_dropdown, 'value') and metric_dropdown.value in valid_bar_metrics else default_bar_metric
|
| 550 |
+
bar_plot_update = create_comparison_chart(filtered_df, current_bar_metric)
|
| 551 |
+
|
| 552 |
+
# Update radar chart multiselect choices and value
|
| 553 |
+
current_radar_models = models_multiselect.value if hasattr(models_multiselect, 'value') else []
|
| 554 |
+
# Keep selected models if they are still in the filtered list, otherwise clear/reset
|
| 555 |
+
valid_radar_selection = [m for m in current_radar_models if m in models_list][:5] # Limit to 5
|
| 556 |
+
radar_multiselect_update = gr.update(choices=models_list, value=valid_radar_selection)
|
| 557 |
+
|
| 558 |
+
# Update radar plot itself
|
| 559 |
+
# Decide whether radar compares within filtered set or globally
|
| 560 |
+
# Using df_global to compare selected models' overall profile
|
| 561 |
+
radar_plot_update = create_radar_chart(df_global, valid_radar_selection, radar_metrics)
|
| 562 |
+
|
| 563 |
+
# Clear all model card outputs
|
| 564 |
+
clear_card = gr.update(value="")
|
| 565 |
+
|
| 566 |
+
return (
|
| 567 |
+
main_metrics_data, model_details_data, community_stats_data, model_mapping_data, # Tables
|
| 568 |
+
radar_multiselect_update, # Radar dropdown update
|
| 569 |
+
bar_plot_update, radar_plot_update, # Plots
|
| 570 |
+
clear_card, clear_card, clear_card, clear_card # Clear all card outputs
|
| 571 |
+
)
|
| 572 |
+
|
| 573 |
+
# Function to display model details card
|
| 574 |
+
def show_model_details_card(evt: gr.SelectData, table_data: pd.DataFrame):
|
| 575 |
+
# Basic validation of event data and table data
|
| 576 |
+
if evt is None or not hasattr(evt, 'index') or not isinstance(evt.index, (list, tuple)) or not evt.index:
|
| 577 |
+
# No valid selection index
|
| 578 |
+
return gr.update(value="")
|
| 579 |
+
if table_data is None or table_data.empty:
|
| 580 |
+
# Table data is missing or empty
|
| 581 |
+
return gr.update(value="<p>Error: Table data is not available.</p>")
|
| 582 |
+
|
| 583 |
+
# Check if evt.index is within bounds
|
| 584 |
+
if not (0 <= evt.index[0] < len(table_data)):
|
| 585 |
+
print(f"Error: Row index {evt.index[0]} out of bounds for table length {len(table_data)}")
|
| 586 |
+
return gr.update(value="<p>Error: Selected row index is out of bounds.</p>")
|
| 587 |
+
|
| 588 |
+
row_idx = evt.index[0] # Get the selected row index
|
| 589 |
+
|
| 590 |
+
try:
|
| 591 |
+
# Check if the identifier column exists in the *displayed* table data
|
| 592 |
+
if model_identifier_col not in table_data.columns:
|
| 593 |
+
print(f"Error: Model identifier column '{model_identifier_col}' not found in the clicked table's data. Columns are: {table_data.columns.tolist()}")
|
| 594 |
+
return gr.update(value=f"<p>Error: Cannot identify model from this table (missing '{model_identifier_col}').</p>")
|
| 595 |
+
|
| 596 |
+
# Get the model identifier from the selected row of the *displayed* table data
|
| 597 |
+
selected_model_identifier = table_data.iloc[row_idx][model_identifier_col]
|
| 598 |
+
|
| 599 |
+
# Check if the identifier is valid (e.g., not NaN or placeholder)
|
| 600 |
+
if pd.isna(selected_model_identifier) or selected_model_identifier == "N/A":
|
| 601 |
+
print(f"Warning: Invalid model identifier '{selected_model_identifier}' selected at index {row_idx}.")
|
| 602 |
+
return gr.update(value="<p>Cannot display details for this entry (invalid identifier).</p>")
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
# Find the full data row in the *original* global dataframe
|
| 606 |
+
# This ensures we have all columns needed for the card, even if not displayed in the clicked table
|
| 607 |
+
full_model_data_row = df_global[df_global[model_identifier_col] == selected_model_identifier]
|
| 608 |
+
|
| 609 |
+
if full_model_data_row.empty:
|
| 610 |
+
print(f"Error: Could not find full details for model identifier '{selected_model_identifier}' in global data.")
|
| 611 |
+
return gr.update(value=f"<p>Error: Could not find full details for {selected_model_identifier}.</p>")
|
| 612 |
+
|
| 613 |
+
# Convert the first found row to a dictionary (handle potential duplicates, take first)
|
| 614 |
+
full_data_dict = full_model_data_row.iloc[0].to_dict()
|
| 615 |
+
|
| 616 |
+
# Create the HTML card
|
| 617 |
+
card_html = create_model_card(full_data_dict)
|
| 618 |
+
return gr.update(value=card_html)
|
| 619 |
+
|
| 620 |
+
except KeyError as e:
|
| 621 |
+
# This might happen if iloc fails or the identifier column name is wrong somehow
|
| 622 |
+
print(f"KeyError during model card generation: {e}. Identifier column: '{model_identifier_col}', Table columns: {table_data.columns.tolist()}")
|
| 623 |
+
return gr.update(value=f"<p>Error: A data key ('{e}') was not found while generating the card.</p>")
|
| 624 |
+
except Exception as e:
|
| 625 |
+
# Catch any other unexpected errors
|
| 626 |
+
import traceback
|
| 627 |
+
print(f"Unexpected error generating model card: {e}\n{traceback.format_exc()}")
|
| 628 |
+
return gr.update(value=f"<p>An unexpected error occurred: {e}</p>")
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
def update_bar_chart_on_metric_change(metric, search, min_global, organization):
    """Rebuild the comparison bar chart when the metric dropdown changes.

    The current filter widget values are accepted so the chart reflects
    exactly the rows the tables are showing.
    """
    # Re-apply the active filters to the global dataframe, then chart the result.
    current_view = filter_data(df_global, search, min_global, organization)
    return create_comparison_chart(current_view, metric)
|
| 638 |
+
def update_radar_chart_on_selection(selected_models, search, min_global, organization):
    """Redraw the radar chart for the models picked in the multiselect.

    Filter values are received for signature compatibility with the wiring,
    but the radar always charts against the full dataset so model profiles
    stay comparable.
    """
    if selected_models:
        return create_radar_chart(df_global, selected_models, radar_metrics)
    # Nothing selected: show an empty placeholder figure with a hint as title.
    placeholder = go.Figure()
    placeholder.update_layout(title="Select models to compare", polar=dict(radialaxis=dict(visible=False)))
    return placeholder
|
| 647 |
+
|
| 648 |
+
# --- Connect Components ---
# All three filter widgets feed the same update function.
filter_inputs = [search_input, min_score_slider, org_dropdown]
# Outputs need to include all tables, the radar multiselect, both plots, and all card outputs.
# NOTE: this ordering must match the return tuple of update_on_filter.
filter_outputs = [
    main_metrics_table, model_details_table, community_stats_table, model_mapping_table, # Tables
    models_multiselect, # Radar dropdown
    bar_chart, radar_chart, # Plots
    main_metrics_card_output, model_details_card_output, community_stats_card_output, model_mapping_card_output # Card outputs
]

# Trigger a full refresh when any filter changes (Enter in the search box,
# releasing the slider, or picking an organization).
search_input.submit(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)
min_score_slider.release(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)
org_dropdown.change(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)

# Connect each table's row-select event to its own model-card output.
# The table component itself is passed as input so the handler receives the
# currently displayed (filtered) table value.
main_metrics_table.select(show_model_details_card, inputs=[main_metrics_table], outputs=[main_metrics_card_output])
model_details_table.select(show_model_details_card, inputs=[model_details_table], outputs=[model_details_card_output])
community_stats_table.select(show_model_details_card, inputs=[community_stats_table], outputs=[community_stats_card_output])
model_mapping_table.select(show_model_details_card, inputs=[model_mapping_table], outputs=[model_mapping_card_output])

# Connect the visualization dropdowns to their chart-update functions.
metric_dropdown.change(
    update_bar_chart_on_metric_change,
    inputs=[metric_dropdown] + filter_inputs, # filters passed so the chart matches the table view
    outputs=bar_chart
)
models_multiselect.change(
    update_radar_chart_on_selection,
    inputs=[models_multiselect] + filter_inputs, # filters passed for signature context only
    outputs=radar_chart
)
|
| 683 |
+
# --- Initial Load ---
def initial_load():
    """Produce the first-render state for every table, both charts and the
    radar multiselect. The return order must match load_outputs."""

    def table_view(cols):
        # Empty/None column list -> empty frame; otherwise show those columns
        # with missing values masked as "N/A" for display.
        if not cols:
            return pd.DataFrame(columns=cols)
        return df_global[cols].fillna("N/A")

    main_tbl = table_view(main_metrics_cols)
    details_tbl = table_view(model_details_cols)
    community_tbl = table_view(community_stats_cols)
    mapping_tbl = table_view(model_mapping_cols)  # includes 'Model Name'

    # Initial bar chart over the full (unfiltered) data.
    bar_fig = create_comparison_chart(df_global, default_bar_metric)

    # Multiselect choices: unique non-null model identifiers, alphabetised.
    if model_identifier_col in df_global.columns and not df_global[model_identifier_col].isna().all():
        choice_list = sorted(df_global[model_identifier_col].dropna().unique().tolist())
    else:
        choice_list = []

    # Initial radar chart uses the default model selection determined earlier.
    radar_fig = create_radar_chart(df_global, default_radar_models, radar_metrics)

    return (
        main_tbl, details_tbl, community_tbl, mapping_tbl,  # Tables
        bar_fig,  # Bar Chart
        gr.update(value=default_radar_models, choices=choice_list),  # radar dropdown value AND choices
        radar_fig,  # Radar Chart
    )
|
| 712 |
+
# Define outputs for the load function - this order must match the tuple
# returned by initial_load exactly.
load_outputs = [
    main_metrics_table, model_details_table, community_stats_table, model_mapping_table, # Tables
    bar_chart, # Bar Chart
    models_multiselect, # Radar dropdown (to set initial value and choices)
    radar_chart # Radar Chart
]
# Populate every component once when the app first renders in the browser.
app.load(initial_load, inputs=[], outputs=load_outputs)

return app
| 723 |
+
# Optional entry point: run this module directly to smoke-test the UI.
# Launch only when the globally loaded dataframe is usable; otherwise report
# the most specific failure reason available.
if __name__ == "__main__":
    if df_global is not None and not df_global.empty and 'Error' not in df_global.columns:
        ui = create_leaderboard_ui()
        ui.launch(debug=True)  # debug=True for easier troubleshooting
    elif df_global is not None and 'Error' in df_global.columns and not df_global.empty:
        # The data loader recorded its failure reason in an 'Error' column.
        # Guarding on non-emptiness avoids an IndexError from iloc[0] when the
        # frame has the column but no rows (the original crashed in that case).
        print(f"Failed to load data: {df_global['Error'].iloc[0]}")
    else:
        print("Dataframe is None or empty. Cannot launch UI.")