Yunxiang committed on
Commit
859c0e2
·
1 Parent(s): aae1bdc
Files changed (5) hide show
  1. app.py +12 -0
  2. data/merged_leaderboards.csv +64 -0
  3. requirements.txt +4 -0
  4. src/data_processing.py +154 -0
  5. src/ui.py +734 -0
app.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py — entry point for the leaderboard Gradio application.
import os
import sys

# Make the project root importable so the `src.*` modules resolve no matter
# which working directory the script is launched from.
_PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _PROJECT_ROOT)

from src.ui import create_leaderboard_ui

if __name__ == "__main__":
    # Build the Gradio UI and start serving it.
    app = create_leaderboard_ui()
    app.launch()
data/merged_leaderboards.csv ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model Name (LiveBench),Organization,Global Average,Reasoning Average,Coding Average,Mathematics Average,Data Analysis Average,Language Average,IF Average,Model Link (LiveBench),Arena Rank (No Style Control),Arena Rank (With Style Control),Model Name (Arena),Arena Score,95% Confidence Interval,# of Votes,Model License,Model Knowledge Cutoff,Model Link (Arena)
2
+ o3-2025-04-16-high,OpenAI,81.55,93.33,73.33,84.67,75.8,76.0,86.17,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
3
+ o3-2025-04-16-medium,OpenAI,79.22,91.0,72.62,80.66,73.21,73.48,84.32,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
4
+ o4-mini-2025-04-16-high,OpenAI,78.13,88.11,74.33,84.9,70.43,66.05,84.96,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
5
+ gemini-2.5-pro-exp-03-25,Google,77.43,87.53,58.09,89.16,79.89,69.31,80.59,https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/,1.0,1.0,Gemini-2.5-Pro-Exp-03-25,1439.0,+7/-5,9013.0,Proprietary,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemini-2.5-pro-exp-03-25
6
+ o4-mini-2025-04-16-medium,OpenAI,72.75,78.47,61.81,81.02,70.96,62.41,81.83,https://openai.com/index/introducing-o3-and-o4-mini/,,,,,,,,,
7
+ o1-2024-12-17-high,OpenAI,72.18,77.47,57.14,79.28,65.47,72.15,81.55,https://openai.com/o1/,10.0,6.0,o1-2024-12-17,1350.0,+3/-3,28990.0,Proprietary,Unknown,https://openai.com/index/o1-and-new-tools-for-developers/
8
+ o3-mini-2025-01-31-high,OpenAI,71.37,74.36,65.48,76.55,70.64,56.86,84.36,https://openai.com/index/openai-o3-mini/,15.0,14.0,o3-mini-high,1325.0,+4/-4,17988.0,Proprietary,Unknown,https://platform.openai.com/docs/guides/reasoning#reasoning-effort
9
+ gemini-2.5-flash-preview-04-17,Google,71.21,73.47,58.38,81.45,75.49,59.43,79.02,https://blog.google/products/gemini/gemini-2-5-flash-preview/,2.0,4.0,Gemini-2.5-Flash-Preview-04-17,1392.0,+10/-13,3325.0,Proprietary,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemini-2.5-flash-preview-04-17
10
+ claude-3-7-sonnet-20250219-thinking-64k,Anthropic,70.57,76.17,44.67,79.0,74.05,68.27,81.25,https://www.anthropic.com/news/claude-3-7-sonnet,18.0,6.0,Claude 3.7 Sonnet (thinking-32k),1303.0,+6/-7,8998.0,Proprietary,Unknown,https://www.anthropic.com/news/claude-3-7-sonnet
11
+ grok-3-mini-beta-high,xAI,68.33,87.61,39.71,77.0,67.87,59.09,78.7,https://x.ai/blog/grok-3,2.0,4.0,Grok-3-Preview-02-24,1402.0,+5/-3,14849.0,Proprietary,Unknown,https://x.ai/blog/grok-3
12
+ deepseek-r1,DeepSeek,67.47,76.58,48.48,77.91,67.6,54.77,79.49,https://huggingface.co/deepseek-ai/DeepSeek-R1,9.0,6.0,DeepSeek-R1,1358.0,+4/-5,16077.0,MIT,Unknown,https://api-docs.deepseek.com/news/news250120
13
+ o3-mini-2025-01-31-medium,OpenAI,67.16,69.0,58.43,71.68,66.56,54.12,83.16,https://openai.com/index/openai-o3-mini/,18.0,20.0,o3-mini,1305.0,+3/-4,24877.0,Proprietary,Unknown,https://openai.com/index/openai-o3-mini/
14
+ qwq-32b,Alibaba,65.69,76.72,43.0,76.08,65.03,51.48,81.83,https://qwenlm.github.io/blog/qwq-32b/,15.0,21.0,QwQ-32B,1316.0,+6/-6,7735.0,Apache 2.0,Unknown,https://huggingface.co/Qwen/QwQ-32B
15
+ gpt-4.5-preview-2025-02-27,OpenAI,62.13,54.42,49.0,67.94,64.33,64.76,72.33,https://openai.com/index/introducing-gpt-4-5/,2.0,2.0,GPT-4.5-Preview,1398.0,+5/-6,14520.0,Proprietary,Unknown,https://openai.com/index/introducing-gpt-4-5/
16
+ gemini-2.0-flash-thinking-exp-01-21,Google,62.05,61.5,35.71,74.81,69.37,48.43,82.47,https://ai.google.dev/gemini-api/docs/thinking-mode,5.0,9.0,Gemini-2.0-Flash-Thinking-Exp-01-21,1380.0,+4/-3,26309.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-thinking-exp-01-21
17
+ gemini-2.0-pro-exp-02-05,Google,61.59,61.75,35.33,68.54,68.02,52.5,83.38,https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/,5.0,5.0,Gemini-2.0-Pro-Exp-02-05,1380.0,+5/-4,20127.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-pro-exp-02-05
18
+ o3-mini-2025-01-31-low,OpenAI,59.76,55.94,50.76,61.67,62.04,48.07,80.06,https://openai.com/index/openai-o3-mini/,,,,,,,,,
19
+ gpt-4.1-2025-04-14,OpenAI,58.41,44.39,42.95,62.39,69.13,54.55,77.05,https://openai.com/index/gpt-4-1/,,,,,,,,,
20
+ claude-3-7-sonnet-20250219-base,Anthropic,58.21,49.11,32.43,64.65,63.37,63.19,76.49,https://www.anthropic.com/news/claude-3-7-sonnet,24.0,11.0,Claude 3.7 Sonnet,1293.0,+4/-5,14287.0,Proprietary,Unknown,https://www.anthropic.com/news/claude-3-7-sonnet
21
+ deepseek-v3-0324,DeepSeek,57.48,44.28,40.52,71.44,60.37,46.82,81.47,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,5.0,4.0,DeepSeek-V3-0324,1372.0,+9/-6,5888.0,MIT,Unknown,https://api-docs.deepseek.com/news/news250325
22
+ grok-3-beta,xAI,56.95,48.53,37.33,62.75,54.53,53.8,84.74,https://x.ai/blog/grok-3,2.0,4.0,Grok-3-Preview-02-24,1402.0,+5/-3,14849.0,Proprietary,Unknown,https://x.ai/blog/grok-3
23
+ chatgpt-4o-latest-2025-03-27,OpenAI,55.84,48.81,38.67,55.72,70.47,49.43,71.92,https://x.com/OpenAIDevs/status/1905335104211185999?t=pmYR2_xGFyWs1xOGuNxRsw&s=19,2.0,2.0,ChatGPT-4o-latest (2025-03-26),1407.0,+6/-6,8261.0,Proprietary,Unknown,https://x.com/OpenAI/status/1905331956856050135
24
+ gpt-4.1-mini-2025-04-14,OpenAI,55.55,53.78,47.57,58.78,64.87,38.0,70.31,https://openai.com/index/gpt-4-1/,,,,,,,,,
25
+ qwen2.5-max,Alibaba,55.14,38.53,33.81,56.87,67.93,58.37,75.35,https://qwenlm.github.io/blog/qwen2.5-max/,12.0,14.0,Qwen2.5-Max,1340.0,+4/-4,21169.0,Proprietary,Unknown,https://qwenlm.github.io/blog/qwen2.5-max/
26
+ gemini-2.0-flash-001,Google,54.89,44.25,26.19,63.19,67.55,42.39,85.79,https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/,9.0,14.0,Gemini-2.0-Flash-001,1354.0,+3/-4,22201.0,Proprietary,Unknown,https://aistudio.google.com/app/prompts/new_chat?instructions=lmsys-1121&model=gemini-2.0-flash-001
27
+ deepseek-r1-distill-llama-70b,DeepSeek,54.69,59.81,46.62,58.8,55.93,37.05,69.94,https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B,,,,,,,,,
28
+ llama-4-maverick-17b-128e-instruct,Meta,54.38,43.83,37.43,60.58,59.03,49.65,75.75,https://ai.meta.com/blog/llama-4-multimodal-intelligence/,35.0,24.0,Llama-4-Maverick-17B-128E-Instruct,1271.0,+7/-8,4917.0,Llama 4,Unknown,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
29
+ o1-mini-2024-09-12,OpenAI,53.43,51.33,41.0,60.26,57.92,44.66,65.4,https://platform.openai.com/docs/guides/reasoning,18.0,24.0,o1-mini,1304.0,+3/-2,54960.0,Proprietary,2023/10,https://platform.openai.com/docs/models/o1
30
+ hunyuan-turbos-20250313,Tencent,50.83,38.22,23.24,57.47,75.49,34.46,76.13,https://cloud.tencent.com/document/product/1729/104753,17.0,19.0,Hunyuan-TurboS-20250226,1302.0,+8/-10,2452.0,Proprietary,Unknown,https://cloud.tencent.com/document/product/1729/104753
31
+ claude-3-5-sonnet-20241022,Anthropic,50.81,43.22,32.29,50.54,55.03,54.48,69.3,https://www.anthropic.com/news/3-5-models-and-computer-use,31.0,14.0,Claude 3.5 Sonnet (20241022),1283.0,+2/-2,64670.0,Proprietary,2024/4,https://www.anthropic.com/news/3-5-models-and-computer-use
32
+ step-2-16k-202411,StepFun,49.86,42.39,31.09,43.68,63.72,38.41,79.88,https://www.stepfun.com/#step2,17.0,24.0,Step-2-16K-Exp,1304.0,+8/-8,5128.0,Proprietary,Unknown,https://platform.stepfun.com/docs/llm/text
33
+ gpt-4o-2024-08-06,OpenAI,49.21,42.58,31.81,45.72,60.91,45.63,68.58,https://openai.com/index/hello-gpt-4o/,41.0,29.0,GPT-4o-2024-08-06,1265.0,+2/-2,47982.0,Proprietary,2023/10,https://platform.openai.com/docs/models/gpt-4o
34
+ deepseek-r1-distill-qwen-32b,DeepSeek,48.14,44.36,52.33,60.13,45.41,30.92,55.71,https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,,,,,,,,,
35
+ grok-2-1212,xAI,48.11,36.72,26.14,55.94,54.45,45.79,69.63,https://x.ai/blog/grok-1212,28.0,29.0,Grok-2-08-13,1288.0,+2/-2,67102.0,Proprietary,2024/3,https://x.ai/blog/grok-2
36
+ gemini-2.0-flash-lite-001,Google,47.77,32.25,23.38,54.97,65.45,33.94,76.63,https://developers.googleblog.com/en/start-building-with-the-gemini-2-0-flash-family/,16.0,20.0,Gemini-2.0-Flash-Lite,1311.0,+5/-5,22122.0,Proprietary,Unknown,https://aistudio.google.com/prompts/new_chat?model=gemini-2.0-flash-lite
37
+ meta-llama-3.1-405b-instruct-turbo,Meta,47.54,40.58,28.81,40.49,55.85,43.58,75.9,https://www.together.ai/blog/meta-llama-3-1,,,,,,,,,
38
+ learnlm-1.5-pro-experimental,Google,47.49,34.86,32.38,56.71,54.97,37.86,68.16,https://ai.google.dev/gemini-api/docs/learnlm,,,,,,,,,
39
+ gpt-4o-2024-11-20,OpenAI,47.03,39.75,35.19,41.48,56.15,44.68,64.94,https://openai.com/index/hello-gpt-4o/,41.0,29.0,GPT-4o-2024-08-06,1265.0,+2/-2,47982.0,Proprietary,2023/10,https://platform.openai.com/docs/models/gpt-4o
40
+ gemma-3-27b-it,Google,46.63,34.42,25.43,52.27,51.45,41.31,74.9,https://blog.google/technology/developers/gemma-3/,11.0,14.0,Gemma-3-27B-it,1342.0,+5/-6,9976.0,Gemma,Unknown,http://aistudio.google.com/app/prompts/new_chat?model=gemma-3-27b-it
41
+ llama-3.3-70b-instruct-turbo,Meta,45.68,32.53,24.05,41.4,49.49,43.97,82.67,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,51.0,42.0,Llama-3.3-70B-Instruct,1257.0,+2/-3,38101.0,Llama-3.3,Unknown,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
42
+ claude-3-opus-20240229,Anthropic,45.61,32.03,23.33,42.93,57.89,53.57,63.89,https://www.anthropic.com/claude,57.0,41.0,Claude 3 Opus,1247.0,+2/-2,202697.0,Proprietary,2023/8,https://www.anthropic.com/news/claude-3-family
43
+ command-a-03-2025,Cohere,45.33,36.33,20.43,45.54,50.06,36.7,82.9,https://cohere.com/blog/command-a,18.0,20.0,Command A (03-2025),1303.0,+7/-6,7547.0,CC-BY-NC-4.0,Unknown,https://cohere.com/blog/command-a
44
+ dracarys2-72b-instruct,AbacusAI,44.83,37.49,25.43,52.25,55.51,33.06,65.22,https://huggingface.co/abacusai/Dracarys2-72B-Instruct,,,,,,,,,
45
+ mistral-large-2411,Mistral AI,43.59,33.83,26.95,42.2,50.15,40.45,67.93,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411,56.0,57.0,Mistral-Large-2411,1249.0,+3/-3,28985.0,MRL,Unknown,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
46
+ qwen2.5-72b-instruct-turbo,Alibaba,43.36,34.08,21.29,51.88,51.91,36.63,64.39,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,51.0,57.0,Qwen2.5-72B-Instruct,1257.0,+2/-3,41532.0,Qwen,2024/9,https://qwenlm.github.io/blog/qwen2.5/
47
+ dracarys2-llama-3.1-70b-instruct,AbacusAI,42.97,36.67,21.24,40.3,53.98,42.37,63.24,https://huggingface.co/abacusai/Dracarys2-Llama-3.1-70B-Instruct,,,,,,,,,
48
+ gemma-3-12b-it,Google,41.25,28.61,19.09,48.14,46.56,31.27,73.83,https://blog.google/technology/developers/gemma-3/,,,,,,,,,
49
+ mistral-small-2503,Mistral AI,40.92,37.08,21.29,38.39,50.54,34.59,63.66,https://mistral.ai/news/mistral-small-3-1,69.0,74.0,Mistral-Small-24B-Instruct-2501,1217.0,+5/-4,14573.0,Apache 2.0,Unknown,https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501
50
+ phi-4,Microsoft,40.68,39.06,29.09,43.03,45.17,29.33,58.38,https://huggingface.co/microsoft/Phi-4,77.0,84.0,Phi-4,1205.0,+4/-3,25224.0,MIT,Unknown,https://huggingface.co/microsoft/phi-4
51
+ meta-llama-3.1-70b-instruct-turbo,Meta,40.52,29.67,19.86,32.54,53.75,38.35,68.98,https://www.together.ai/blog/meta-llama-3-1,57.0,63.0,Meta-Llama-3.1-70B-Instruct,1248.0,+2/-2,58654.0,Llama 3.1 Community,2023/12,https://ai.meta.com/blog/meta-llama-3-1/
52
+ amazon.nova-pro-v1:0,Amazon,40.05,28.25,20.0,37.7,48.31,38.94,67.13,https://aws.amazon.com/ai/generative-ai/nova/,59.0,64.0,Amazon Nova Pro 1.0,1245.0,+3/-4,24285.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
53
+ gpt-4.1-nano-2025-04-14,OpenAI,39.72,35.58,25.29,42.39,46.59,30.96,57.54,https://openai.com/index/gpt-4-1/,,,,,,,,,
54
+ claude-3-5-haiku-20241022,Anthropic,38.49,26.19,19.86,34.84,48.45,39.71,61.88,https://www.anthropic.com/claude/haiku,65.0,43.0,Claude 3.5 Haiku (20241022),1237.0,+3/-3,33322.0,Proprietary,Unknown,https://www.anthropic.com/news/3-5-models-and-computer-use
55
+ gpt-4o-mini-2024-07-18,OpenAI,37.63,25.64,25.48,38.05,49.96,29.88,56.8,https://openai.com/index/hello-gpt-4o/,36.0,43.0,GPT-4o-mini-2024-07-18,1272.0,+2/-2,71388.0,Proprietary,2023/10,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
56
+ gemini-1.5-flash-8b-001,Google,34.37,18.69,16.48,32.17,46.27,22.87,69.72,https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash-8b,70.0,83.0,Gemini-1.5-Flash-8B-001,1212.0,+3/-3,37686.0,Proprietary,Unknown,https://aistudio.google.com/app/prompts/new_chat?instructions=lmsys&model=gemini-1.5-flash-8b
57
+ amazon.nova-lite-v1:0,Amazon,33.47,32.0,15.24,34.62,37.23,27.62,54.13,https://aws.amazon.com/ai/generative-ai/nova/,69.0,80.0,Amazon Nova Lite 1.0,1217.0,+4/-4,20648.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
58
+ azerogpt,SoundAI,32.7,24.47,15.95,31.81,33.95,30.73,59.31,https://azero.soundai.com/#/dashboard,,,,,,,,,
59
+ gemma-3-4b-it,Google,30.13,19.78,11.71,31.33,39.3,15.06,63.58,https://blog.google/technology/developers/gemma-3/,,,,,,,,,
60
+ command-r-plus-08-2024,Cohere,29.86,21.64,8.19,22.82,38.06,30.86,57.61,https://docs.cohere.com/docs/models,69.0,70.0,Command R+ (08-2024),1215.0,+6/-6,10539.0,CC-BY-NC-4.0,2024/8,https://docs.cohere.com/docs/command-r-plus#model-details
61
+ qwen2.5-7b-instruct-turbo,Alibaba,29.22,22.31,11.0,36.81,35.22,17.85,52.11,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,,,,,,,,,
62
+ amazon.nova-micro-v1:0,Amazon,28.65,25.42,6.14,34.15,33.95,24.19,48.04,https://aws.amazon.com/ai/generative-ai/nova/,82.0,93.0,Amazon Nova Micro 1.0,1198.0,+4/-4,20663.0,Proprietary,Unknown,https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html
63
+ command-r-08-2024,Cohere,26.99,20.58,6.1,18.35,33.34,27.93,55.62,https://docs.cohere.com/docs/models,89.0,84.0,Command R (08-2024),1180.0,+5/-5,10848.0,CC-BY-NC-4.0,2024/8,https://docs.cohere.com/docs/command-r-plus#model-details
64
+ meta-llama-3.1-8b-instruct-turbo,Meta,24.95,14.78,11.05,15.08,32.82,21.1,54.9,https://www.together.ai/blog/meta-llama-3-1,91.0,109.0,Meta-Llama-3.1-8B-Instruct,1176.0,+3/-2,52597.0,Llama 3.1 Community,2023/12,https://ai.meta.com/blog/meta-llama-3-1/
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==5.25.2
2
+ pandas
3
+ plotly
4
+ numpy
src/data_processing.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/data_processing.py
2
+ import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import re
6
+
7
def load_data(file_path='data/merged_leaderboards.csv'):
    """Load the merged leaderboard CSV and normalize it for the UI.

    Returns a DataFrame sorted by 'Global Average' (descending, NaNs last)
    with a unified 'Model Name' column and duplicate model rows removed.
    On any failure an empty DataFrame is returned.
    """
    try:
        df = pd.read_csv(file_path)

        # Treat empty strings as missing values.
        df = df.replace('', np.nan)

        # Coerce every score/rank/vote column to a numeric dtype.
        numeric_cols = [
            'Global Average', 'Reasoning Average', 'Coding Average',
            'Mathematics Average', 'Data Analysis Average',
            'Language Average', 'IF Average',
            'Arena Rank (No Style Control)', 'Arena Rank (With Style Control)',
            'Arena Score', '# of Votes'
        ]
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # The CI column is display-only text like "+5/-4"; keep it as string.
        if '95% Confidence Interval' in df.columns:
            df['95% Confidence Interval'] = df['95% Confidence Interval'].astype(str)

        if 'Global Average' in df.columns:
            df = df.sort_values('Global Average', ascending=False, na_position='last')

        # Unified display name: prefer the LiveBench name, fall back to Arena.
        has_livebench = 'Model Name (LiveBench)' in df.columns
        has_arena = 'Model Name (Arena)' in df.columns
        if has_livebench and has_arena:
            df['Model Name'] = df['Model Name (LiveBench)'].fillna(df['Model Name (Arena)'])
        elif has_livebench:
            df['Model Name'] = df['Model Name (LiveBench)']
        elif has_arena:
            df['Model Name'] = df['Model Name (Arena)']
        else:
            df['Model Name'] = 'Unknown Model'  # Fallback

        # After sorting, keep only the best-ranked row per model name so
        # clashing names cannot produce duplicate leaderboard entries.
        if 'Model Name' in df.columns:
            df = df.drop_duplicates(subset=['Model Name'], keep='first')

        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame()
54
+
55
def get_column_groups():
    """Return the named column groupings used by the UI, including Model Mapping.

    Every group begins with the unified model identifier ('Model Name',
    produced by load_data) so each view can always label its rows.
    """
    ident = "Model Name"  # unified name column created in data loading

    livebench_scores = [
        "Global Average", "Reasoning Average", "Coding Average",
        "Mathematics Average", "Data Analysis Average",
        "Language Average", "IF Average",
    ]
    arena_stats = [
        "Arena Rank (No Style Control)", "Arena Rank (With Style Control)",
        "Arena Score", "95% Confidence Interval", "# of Votes",
    ]
    name_and_link_cols = [
        "Model Name (LiveBench)", "Model Name (Arena)",
        "Model Link (LiveBench)", "Model Link (Arena)",
    ]

    return {
        "Main Metrics": [ident, "Organization"] + livebench_scores,
        "Model Details": [
            ident, "Organization", "Model License",
            "Model Knowledge Cutoff", "Model Link (LiveBench)", "Model Link (Arena)",
        ],
        "Community Stats": [ident, "Organization"] + arena_stats,
        "Model Mapping": [ident] + name_and_link_cols,
        # Superset of columns available to the card view; keep comprehensive.
        "All Displayable": (
            [ident, "Organization"]
            + livebench_scores
            + ["Model License", "Model Knowledge Cutoff",
               "Model Name (LiveBench)", "Model Link (LiveBench)",
               "Model Name (Arena)", "Model Link (Arena)"]
            + arena_stats
        ),
    }
95
+
96
def filter_data(df, search, min_global, organization):
    """Return the rows of *df* that satisfy the UI filter controls.

    Args:
        df: leaderboard DataFrame (may be None or empty).
        search: free-text query matched case-insensitively against the
            model-name and organization columns.
        min_global: minimum 'Global Average' score; 0 disables the filter.
        organization: exact organization name, or "All" to disable.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    result = df.copy()

    # Score threshold — NaN scores never satisfy .ge(), so they drop out.
    if min_global > 0 and 'Global Average' in result.columns:
        result = result[result['Global Average'].ge(min_global)]

    # Exact organization match; explicitly exclude missing values.
    if organization and organization != "All" and 'Organization' in result.columns:
        org_mask = result['Organization'].eq(organization) & result['Organization'].notna()
        result = result[org_mask]

    # Case-insensitive substring search across the name-like columns.
    query = (search or "").strip()
    if query:
        pattern = re.escape(query.lower())  # literal match, not user regex
        mask = pd.Series(False, index=result.index)
        for col in ('Model Name', 'Organization',
                    'Model Name (LiveBench)', 'Model Name (Arena)'):
            if col in result.columns:
                # Stringify (NaN-safe), lowercase, then substring-match.
                mask |= result[col].astype(str).str.lower().str.contains(pattern, na=False)
        result = result[mask]

    return result
133
+
134
+
135
def get_organization_list(df):
    """Return dropdown choices: "All" followed by the sorted unique organizations."""
    if df is None or 'Organization' not in df.columns:
        return ["All"]
    # Unique non-missing truthy values, stringified so sorting never fails.
    sorted_orgs = sorted(str(org) for org in df['Organization'].dropna().unique() if org)
    return ["All"] + sorted_orgs
143
+
144
def get_top_models(df, column='Global Average', n=15):
    """Return the *n* best models ranked by *column*.

    Rank-style columns sort ascending (a lower rank is better); every other
    metric sorts descending. Rows missing a value in *column* sort last.
    Returns an empty DataFrame when *df* is unusable or *column* is absent.
    """
    if df is None or df.empty or column not in df.columns:
        return pd.DataFrame()

    # 'Rank' in the column name signals that smaller numbers are better.
    lower_is_better = 'Rank' in column
    ordered = df.sort_values(column, ascending=lower_is_better, na_position='last')
    return ordered.head(n)
src/ui.py ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/ui.py
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from src.data_processing import (
7
+ load_data, filter_data, get_organization_list,
8
+ get_column_groups, get_top_models
9
+ )
10
+ import numpy as np # Needed for scaling check
11
+
12
# --- Global Setup ---
# Module-level state computed once at import time and shared by the UI
# callbacks below: df_global, column_groups_global, model_identifier_col,
# radar_metrics, and the Arena-Score scaling bounds.
try:
    df_global = load_data()
    if df_global.empty:
        print("Warning: Dataframe is empty after loading.")
        # Add Arena Score default if data loading creates an empty frame
        df_global = pd.DataFrame(columns=['Model Name', 'Organization', 'Global Average', 'Arena Score'])

    column_groups_global = get_column_groups()
    # Consistent model identifier from data_processing
    # Safely get the identifier, default to 'Model Name' if group is missing/empty
    model_identifier_col = column_groups_global.get("Main Metrics", ["Model Name"])[0] if column_groups_global.get("Main Metrics") else "Model Name"


    # Define base radar metrics
    base_radar_metrics = [
        "Reasoning Average", "Coding Average", "Mathematics Average",
        "Data Analysis Average", "Language Average", "IF Average"
    ]
    # Add Arena Score if available in the data
    if 'Arena Score' in df_global.columns:
        base_radar_metrics.append('Arena Score')

    # Filter to metrics actually present
    radar_metrics = [m for m in base_radar_metrics if m in df_global.columns]

    # Calculate global min/max for Arena Score scaling (handle NaNs and potential lack of column)
    global_arena_min = np.nan
    global_arena_max = np.nan
    if 'Arena Score' in df_global.columns:
        # Ensure the column exists before trying to access it
        if not df_global['Arena Score'].isna().all():  # Check if there are any non-NaN values
            global_arena_min = df_global['Arena Score'].min(skipna=True)
            global_arena_max = df_global['Arena Score'].max(skipna=True)
            # Handle case where min/max might still be NaN (e.g., column exists but all values are NaN)
            if pd.isna(global_arena_min) or pd.isna(global_arena_max):
                global_arena_min = 0  # Default scale if calculation fails
                global_arena_max = 1  # Avoid division by zero later
            elif global_arena_max == global_arena_min:
                # Avoid division by zero if all scores are the same
                global_arena_max += 1  # Or set a default range like 0-100? Adjust as needed.
        else:
            # Column exists but all values are NaN
            global_arena_min = 0
            global_arena_max = 1
    else:
        # Arena Score column doesn't exist
        global_arena_min = 0
        global_arena_max = 1


except Exception as e:
    # Last-resort fallback: keep every module-level name defined so the UI
    # can still be constructed and can surface the load failure to the user.
    print(f"Critical error during global setup: {e}")
    df_global = pd.DataFrame({'Error': [f'Failed to load data: {e}']})
    column_groups_global = {}
    model_identifier_col = 'Error'
    radar_metrics = []
    global_arena_min, global_arena_max = 0, 1  # Default values on error
70
+
71
# --- Helper Functions ---
def get_valid_columns(df, column_list):
    """Filter *column_list* down to names that are actual columns of *df*."""
    if df is None or column_list is None:
        return []
    existing = set(df.columns)
    # Preserve the requested ordering while dropping unknown names.
    return [name for name in column_list if name in existing]
76
+
77
+ # --- UI Component Creation Functions ---
78
# Renders one leaderboard row as a self-contained HTML card.
def create_model_card(model_data):
    """Create a detailed card view for a single model.

    Args:
        model_data: one leaderboard row as a plain dict (column -> value),
            e.g. an element of ``DataFrame.to_dict('records')``. Missing
            keys and NaN values render as 'N/A'.

    Returns:
        str: an HTML fragment for the card, or an error paragraph when
        *model_data* is not a dict.
    """
    if not isinstance(model_data, dict): return "<p>Error: Invalid model data format.</p>"

    # Use the globally defined identifier
    model_name = model_data.get(model_identifier_col, 'Unknown Model')

    # Helper to safely get data, handling potential NaNs or missing keys
    def get_data(key, default='N/A'):
        val = model_data.get(key)
        # Check for NaN specifically with pandas, otherwise just check for None
        return default if pd.isna(val) else val

    # Determine the primary link, prioritizing LiveBench then Arena
    model_link_livebench = get_data('Model Link (LiveBench)', default=None)
    model_link_arena = get_data('Model Link (Arena)', default=None)
    # Ensure we don't use 'N/A' as a link
    model_link = None
    if model_link_livebench and model_link_livebench != 'N/A':
        model_link = model_link_livebench
    elif model_link_arena and model_link_arena != 'N/A':
        model_link = model_link_arena


    # Build the card HTML structure
    card_html = f"""
    <div class="model-card">
        <h3>{model_name}</h3>
        <h4>by {get_data('Organization')}</h4>
        <div class="metrics">
            <div class="metric-group">
                <h5>Performance Metrics</h5>
                <table class="metric-table">
                    <tr><td>Global Average:</td><td><b>{get_data('Global Average', 'N/A')}</b></td></tr>
                    <tr><td>Reasoning:</td><td>{get_data('Reasoning Average', 'N/A')}</td></tr>
                    <tr><td>Coding:</td><td>{get_data('Coding Average', 'N/A')}</td></tr>
                    <tr><td>Mathematics:</td><td>{get_data('Mathematics Average', 'N/A')}</td></tr>
                    <tr><td>Data Analysis:</td><td>{get_data('Data Analysis Average', 'N/A')}</td></tr>
                    <tr><td>Language:</td><td>{get_data('Language Average', 'N/A')}</td></tr>
                    <tr><td>IF Average:</td><td>{get_data('IF Average', 'N/A')}</td></tr>
                </table>
            </div>
            <div class="metric-group">
                <h5>Community Data (Arena)</h5>
                <table class="metric-table">
                    <tr><td>Rank (No Style Ctrl):</td><td>{get_data('Arena Rank (No Style Control)', 'N/A')}</td></tr>
                    <tr><td>Rank (Style Ctrl):</td><td>{get_data('Arena Rank (With Style Control)', 'N/A')}</td></tr>
                    <tr><td>Arena Score:</td><td>{get_data('Arena Score', 'N/A')}</td></tr>
                    <tr><td>Confidence Interval:</td><td>{get_data('95% Confidence Interval', 'N/A')}</td></tr>
                    <tr><td># of Votes:</td><td>{get_data('# of Votes', 'N/A')}</td></tr>
                </table>
            </div>
        </div>
        <div class="model-details">
            <h5>Model Information</h5>
            <table class="detail-table">
                <tr><td>License:</td><td>{get_data('Model License', 'N/A')}</td></tr>
                <tr><td>Knowledge Cutoff:</td><td>{get_data('Model Knowledge Cutoff', 'N/A')}</td></tr>
                {f"<tr><td>LiveBench Name:</td><td>{get_data('Model Name (LiveBench)', 'N/A')}</td></tr>" if get_data('Model Name (LiveBench)', default=None) else ""}
                {f"<tr><td>Arena Name:</td><td>{get_data('Model Name (Arena)', 'N/A')}</td></tr>" if get_data('Model Name (Arena)', default=None) else ""}
                {f"<tr><td>LiveBench Link:</td><td><a href='{model_link_livebench}' target='_blank'>{model_link_livebench}</a></td></tr>" if model_link_livebench and model_link_livebench != 'N/A' else ""}
                {f"<tr><td>Arena Link:</td><td><a href='{model_link_arena}' target='_blank'>{model_link_arena}</a></td></tr>" if model_link_arena and model_link_arena != 'N/A' else ""}
            </table>
        </div>
    """
    # Add the "Learn More" button only if a valid link was found
    if model_link:
        card_html += f"""<div class="model-link"><a href="{model_link}" target="_blank" rel="noopener noreferrer">Learn More</a></div>"""

    card_html += """</div>"""  # Close model-card div
    return card_html
150
+
151
+
152
+ # create_comparison_chart remains the same as previous version
153
def create_comparison_chart(df, metric):
    """Build a bar chart of the top 15 models ranked by *metric*.

    Args:
        df: Merged leaderboard DataFrame (may be None/empty).
        metric: Column name to rank and plot on the y-axis.

    Returns:
        A plotly Figure; a titled empty figure when there is nothing to show.
    """
    if df is None or df.empty or metric not in df.columns:
        return go.Figure().update_layout(
            title=f"No data available for {metric}", xaxis_title="Model", yaxis_title=metric
        )

    # get_top_models already sorts appropriately (ascending for rank columns).
    top_df = get_top_models(df, metric, n=15)
    if top_df.empty:
        return go.Figure().update_layout(
            title=f"No models found for {metric}", xaxis_title="Model", yaxis_title=metric
        )

    # Ranks read best-first from the smallest value; scores from the largest.
    order_mode = 'total ascending' if 'Rank' in metric else 'total descending'

    # Columns wired into customdata for the hover box — keep only those that
    # actually exist in the plotted frame (index positions matter below).
    hover_columns = [
        col for col in (
            model_identifier_col,
            'Organization',
            'Arena Score',
            'Arena Rank (No Style Control)',
            'Global Average',
        )
        if col in top_df.columns
    ]

    fig = px.bar(
        top_df,
        x=model_identifier_col,
        y=metric,
        color='Organization' if 'Organization' in top_df.columns else None,
        title=f'Top 15 Models by {metric}',
        labels={model_identifier_col: "Model Name", "Organization": "Organization", metric: metric},
        custom_data=hover_columns,
        height=500,
    )

    fig.update_layout(
        xaxis_title="Model Name",
        yaxis_title=metric,
        xaxis={'categoryorder': order_mode},
        plot_bgcolor='rgba(240, 240, 240, 0.8)',
        # Extra bottom margin so the tilted tick labels are not clipped.
        margin=dict(l=40, r=20, t=60, b=120),
    )
    fig.update_xaxes(tickangle=45)

    # Assemble the hovertemplate from customdata slots.
    pieces = []

    def _hover_line(column, label, fmt=""):
        # Only reference columns that made it into custom_data.
        if column in hover_columns:
            slot = hover_columns.index(column)
            pieces.append(f"<b>{label}:</b> %{{customdata[{slot}]{fmt}}}<br>")

    _hover_line(model_identifier_col, "Model")
    _hover_line('Organization', "Organization")
    # The plotted metric always comes straight from the y-axis.
    pieces.append(f"<b>{metric}:</b> %{{y:,.2f}}<br>")
    # Secondary context metrics, skipping whichever one is already on the y-axis.
    for column, label, fmt in (
        ('Global Average', "Global Avg", ":,.2f"),
        ('Arena Score', "Arena Score", ":,.0f"),
        ('Arena Rank (No Style Control)', "Arena Rank", ":,.0f"),
    ):
        if column != metric:
            _hover_line(column, label, fmt)
    pieces.append("<extra></extra>")  # suppress the secondary trace box

    fig.update_traces(hovertemplate="".join(pieces))
    return fig
229
+
230
+
231
+ # create_radar_chart remains the same as previous version
232
def create_radar_chart(df, model_names, metrics):
    """Radar chart comparing the selected models across *metrics*.

    'Arena Score' is min-max scaled to 0-100 using the module-level
    `global_arena_min`/`global_arena_max` so it shares an axis range with
    the percentage-style LiveBench metrics; every other value is clipped
    into [0, 100]. Missing or non-numeric entries plot as 0.

    Args:
        df: Full merged leaderboard DataFrame.
        model_names: Values of `model_identifier_col` to include.
        metrics: Candidate metric column names.

    Returns:
        A plotly Figure; a titled empty figure when nothing can be drawn.
    """
    no_axis = dict(radialaxis=dict(visible=False))

    if df is None or df.empty or not model_names or not metrics:
        return go.Figure().update_layout(title="Select models and ensure metrics exist", polar=no_axis)

    usable_metrics = [m for m in metrics if m in df.columns]
    if not usable_metrics:
        return go.Figure().update_layout(title="None of the selected metrics exist in the data", polar=no_axis)

    # .loc + copy keeps pandas from warning about later modifications.
    selection = df.loc[df[model_identifier_col].isin(model_names)].copy()
    if selection.empty:
        return go.Figure().update_layout(title="Selected models not found in data", polar=no_axis)

    def _axis_value(row, metric):
        """Return (axis label, 0-100 value) for one metric of one model row."""
        if metric == 'Arena Score':
            label = 'Arena Score (Scaled)'  # distinct label for the scaled axis
            raw = row['Arena Score'] if 'Arena Score' in row else None
            if raw is not None and pd.notna(raw):
                span = global_arena_max - global_arena_min
                if span > 0:
                    return label, np.clip((raw - global_arena_min) / span * 100, 0, 100)
                return label, 50  # no spread in global scores: neutral midpoint
            return label, 0  # missing score
        if metric in row and pd.notna(row[metric]):
            as_number = pd.to_numeric(row[metric], errors='coerce')
            if pd.notna(as_number):
                return metric, np.clip(as_number, 0, 100)
        return metric, 0  # missing / non-numeric

    fig = go.Figure()
    axis_labels = []   # canonical theta labels, fixed by the first model
    traces_added = 0

    for _, row in selection.iterrows():
        labels, values = [], []
        for metric in usable_metrics:
            label, value = _axis_value(row, metric)
            labels.append(label)
            values.append(value)

        if not axis_labels:
            axis_labels = labels

        # Only draw when this row's values line up with the canonical axes.
        if values and len(values) == len(axis_labels):
            fig.add_trace(go.Scatterpolar(
                # Repeat the first point to close the polygon.
                r=values + [values[0]],
                theta=axis_labels + [axis_labels[0]],
                fill='toself',
                name=row[model_identifier_col],
                hoverinfo='text',
                text=[f"{lbl}: {val:.2f}" for lbl, val in zip(axis_labels, values)]
                     + [f"{axis_labels[0]}: {values[0]:.2f}"],
            ))
            traces_added += 1

    if traces_added == 0:
        return go.Figure().update_layout(title="No data to display for selected models/metrics", polar=no_axis)

    fig.update_layout(
        # Fixed range is safe: every value was scaled/clipped to [0, 100].
        polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
        showlegend=True,
        legend_title_text='Model Name',
        title="Model Comparison Across Metrics",
        height=600,
    )
    return fig
323
+
324
+
325
+ # --- Main UI Function ---
326
+
327
def create_leaderboard_ui():
    """Create the main Gradio UI.

    Builds a Blocks app containing: a header and introduction, shared filter
    controls (search box, organization dropdown, minimum-score slider), four
    tabbed DataFrame views of the merged leaderboard, and a Visualizations
    tab (top-15 bar chart + multi-model radar chart). Clicking a table row
    renders an HTML model card below that table.

    Reads module-level globals: ``df_global``, ``column_groups_global``,
    ``model_identifier_col`` and (presumably defined earlier in this file —
    not visible here) ``radar_metrics``.

    Returns:
        The assembled (but not launched) ``gr.Blocks`` application.
    """
    css = """
    /* General Layout & Styling */
    .gradio-container { max-width: 95% !important; margin: 0 auto !important;}
    .container { max-width: none; margin: 0 auto; padding: 0 15px; }
    .header { text-align: center; margin-bottom: 1rem; } /* Reduced margin */
    .intro-text { max-width: 800px; margin: 0 auto 2rem auto; padding: 15px; background-color: #f8f9fa; border-radius: 8px; text-align: left; font-size: 0.95em; line-height: 1.6; border: 1px solid #e9ecef; }
    .intro-text h4 { margin-top: 0; margin-bottom: 10px; color: #4a69bd; }
    .intro-text p { margin-bottom: 10px; }
    .intro-text ul { margin-left: 20px; margin-bottom: 10px; }
    .intro-text a { color: #3b5998; text-decoration: none; }
    .intro-text a:hover { text-decoration: underline; }


    /* Filter Container */
    .filter-container { background-color: #f5f7fa; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); }

    /* Model Card Styling */
    .model-card { background: white; border-radius: 8px; padding: 20px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-bottom: 20px; border: 1px solid #eee; }
    .model-card h3 { margin-top: 0; color: #333; }
    .model-card h4 { margin-bottom: 15px; color: #555; font-weight: normal; }
    .metrics { display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 20px; margin: 15px 0; }
    .metric-group h5 { margin-bottom: 10px; border-bottom: 1px solid #eee; padding-bottom: 5px; color: #4a69bd; font-size: 1em; }
    .metric-table, .detail-table { width: 100%; border-collapse: collapse; font-size: 0.9em; }
    .metric-table td, .detail-table td { padding: 5px 0; vertical-align: top; }
    .metric-table td:first-child { color: #555; width: 60%; }
    .metric-table td:last-child { font-weight: bold; text-align: right; }
    .detail-table td:first-child { color: #555; width: 30%;}
    .detail-table td:last-child { width: 70%; word-wrap: break-word; } /* Allow long details to wrap */
    .model-details h5 { margin-top: 20px; margin-bottom: 10px; border-bottom: 1px solid #eee; padding-bottom: 5px; color: #4a69bd; font-size: 1em; }
    .model-link { margin-top: 15px; text-align: right; }
    .model-link a { display: inline-block; background: #4a69bd; color: white !important; padding: 8px 16px; border-radius: 4px; text-decoration: none; font-weight: bold; font-size: 0.9em; transition: background-color 0.2s ease; }
    .model-link a:hover { background: #3b549a; }

    /* DataFrame Table Styling */
    .gradio-dataframe { overflow-x: auto; } /* Add horizontal scroll if needed */
    .gradio-dataframe table.dataframe { table-layout: auto; width: 100%; border-collapse: collapse; border: 1px solid #ddd; font-size: 0.9em; }
    .gradio-dataframe table.dataframe th, .gradio-dataframe table.dataframe td { white-space: normal; word-wrap: break-word; padding: 8px 10px; text-align: left; vertical-align: top; }
    .gradio-dataframe table.dataframe th { background-color: #f2f2f2; border-bottom: 2px solid #ddd; font-weight: bold; }
    .gradio-dataframe table.dataframe td { border-bottom: 1px solid #eee; }
    .gradio-dataframe table.dataframe tr:nth-child(even) { background-color: #f9f9f9; }
    /* REMOVED: .gradio-dataframe table.dataframe tr:hover { background-color: #e9f7ef; } */ /* Removed hover effect */

    /* Attempt to hide the multi-cell selection visual */
    .gradio-dataframe table.dataframe td::selection {
        background-color: transparent; /* Make selection background invisible */
        color: inherit; /* Keep text color the same */
    }
    .gradio-dataframe table.dataframe td::-moz-selection { /* Firefox */
        background-color: transparent;
        color: inherit;
    }


    /* Plot Styling */
    .gradio-plot { min-height: 400px; padding-top: 10px; }

    /* Text Selection Control */
    /* Allow selection everywhere by default */
    body, .gradio-container {
        user-select: text;
        -webkit-user-select: text;
        -ms-user-select: text;
    }
    /* Ensure dataframes and model cards specifically allow text selection */
    /* This allows copying text content */
    .gradio-dataframe, .model-card {
        user-select: text !important;
        -webkit-user-select: text !important;
        -ms-user-select: text !important;
    }
    """

    with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue)) as app:
        # --- Header and Introduction ---
        gr.HTML("""<div class="header"><h1>LLM Leaderboard Explorer</h1><p>Interactive visualization of merged leaderboards data</p></div>""")

        # --- Introduction Section (static markdown, styled via .intro-text CSS) ---
        gr.Markdown("""
        <div class="intro-text">
        <h4>Welcome!</h4>
        <p>This application provides an interactive view of combined data from two leading LLM evaluation platforms:</p>
        <ul>
        <li><a href="https://livebench.ai/#/" target="_blank">LiveBench</a>: ​LiveBench is a dynamic benchmark, featuring monthly updated, contamination-free tasks with objective scoring across diverse domains.</li>
        <li><a href="https://huggingface.co/spaces/lmarena-ai/chatbot-arena-leaderboard" target="_blank">LMSYS Chatbot Arena</a>: Uses crowd-sourced human preferences (Elo ratings) to rank models based on conversation quality.</li>
        </ul>
        <h4>Key Data Points:</h4>
        <ul>
        <li><b>Performance Metrics (LiveBench):</b> Includes 'Global Average' score and specific capability scores like 'Reasoning', 'Coding', 'Mathematics', etc. Higher is generally better.</li>
        <li><b>Community Stats (LMSYS):</b> Features 'Arena Score' (Elo rating) and corresponding Ranks. Higher scores/lower ranks are better.</li>
        <li><b>Model Details:</b> Provides information like Organization, License, and Knowledge Cutoff date.</li>
        </ul>
        <h4>How to Use This App:</h4>
        <ul>
        <li><b>Filter & Search:</b> Use the controls above the tabs to search for models or filter by organization and minimum 'Global Average' score.</li>
        <li><b>Explore Tabs:</b> View different slices of the data (Performance, Details, Community Stats, Mapping).</li>
        <li><b>View Model Card:</b> Click on any row in the tables (except in the Visualizations tab) to see a detailed card with all metrics for that model.</li>
        <li><b>Visualize & Compare:</b> Use the 'Visualizations' tab to compare top models on specific metrics (Bar Chart) or compare selected models across multiple dimensions (Radar Chart).</li>
        </ul>
        </div>
        """, elem_classes="intro-text-container")  # elem_classes for potential container styling if needed

        # --- Filter Components (shared by all tabs) ---
        with gr.Row(elem_classes="filter-container"):
            search_input = gr.Textbox(label="Search Models/Org", placeholder="Search...", show_label=False, scale=2)
            org_choices = get_organization_list(df_global)  # Get org list once at build time
            org_dropdown = gr.Dropdown(choices=org_choices, label="Organization", value="All", show_label=False, scale=1, min_width=160)
            max_slider_val = 100
            if 'Global Average' in df_global.columns and not df_global['Global Average'].isna().all():
                # Calculate max based on data, ensuring the slider reaches at least 100
                max_slider_val = max(100, df_global['Global Average'].max(skipna=True))
            min_score_slider = gr.Slider(minimum=0, maximum=max_slider_val, value=0, label="Min Global Avg", step=1, scale=2, min_width=200)

        # --- Tabbed Interface ---
        with gr.Tabs() as tabs:
            # Resolve column sets once; get_valid_columns drops names missing from df_global
            main_metrics_cols = get_valid_columns(df_global, column_groups_global.get("Main Metrics", []))
            model_details_cols = get_valid_columns(df_global, column_groups_global.get("Model Details", []))
            community_stats_cols = get_valid_columns(df_global, column_groups_global.get("Community Stats", []))
            model_mapping_cols = get_valid_columns(df_global, column_groups_global.get("Model Mapping", []))  # Includes 'Model Name'

            # Tab 1: Performance Metrics
            with gr.TabItem("Performance Metrics"):
                main_metrics_table = gr.DataFrame(
                    interactive=True,  # Enable row selection (needed for .select events)
                    wrap=True,
                    elem_classes="gradio-dataframe"
                )
                gr.Markdown("Click a row for details.")
                main_metrics_card_output = gr.HTML()  # Output for this tab's model card

            # Tab 2: Model Details
            with gr.TabItem("Model Details"):
                model_details_table = gr.DataFrame(
                    interactive=True,  # Enable row selection
                    wrap=True,
                    elem_classes="gradio-dataframe"
                )
                gr.Markdown("Click a row for details.")
                model_details_card_output = gr.HTML()  # Output for this tab's model card

            # Tab 3: Community Stats
            with gr.TabItem("Community Stats"):
                community_stats_table = gr.DataFrame(
                    interactive=True,  # Enable row selection
                    wrap=True,
                    elem_classes="gradio-dataframe"
                )
                gr.Markdown("Click a row for details.")
                community_stats_card_output = gr.HTML()  # Output for this tab's model card

            # Tab 4: Model Mapping
            with gr.TabItem("Model Mapping"):
                model_mapping_table = gr.DataFrame(
                    interactive=True,  # Enable row selection
                    wrap=True,
                    elem_classes="gradio-dataframe"
                )
                gr.Markdown("Click a row for details.")
                model_mapping_card_output = gr.HTML()  # Output for this tab's model card

            # Tab 5: Visualizations (no row selection here)
            with gr.TabItem("Visualizations"):
                gr.Markdown("### Compare Top Models by Metric")
                with gr.Row():
                    # Candidate metrics for the bar chart
                    perf_metrics = ["Global Average", "Reasoning Average", "Coding Average", "Mathematics Average", "Data Analysis Average", "Language Average", "IF Average"]
                    arena_metrics = ["Arena Score", "Arena Rank (No Style Control)", "Arena Rank (With Style Control)", "# of Votes"]
                    # Combine and filter against the columns actually present in df_global
                    available_bar_metrics = perf_metrics + [m for m in arena_metrics if m in df_global.columns]
                    valid_bar_metrics = get_valid_columns(df_global, available_bar_metrics)
                    # Default to the first valid metric (None when nothing is available)
                    default_bar_metric = valid_bar_metrics[0] if valid_bar_metrics else None
                    metric_dropdown = gr.Dropdown(choices=valid_bar_metrics, label="Select Metric for Bar Chart", value=default_bar_metric)
                    bar_chart = gr.Plot(label="Top 15 Model Comparison", elem_classes="gradio-plot")

                gr.Markdown("### Radar Chart Comparison")
                with gr.Row():
                    # Model choices come from the identifier column
                    model_choices = []
                    if model_identifier_col in df_global.columns and not df_global[model_identifier_col].isna().all():
                        model_choices = sorted(df_global[model_identifier_col].dropna().unique().tolist())

                    # Default radar selection: top 3 models by Global Average
                    default_radar_models = []
                    if 'Global Average' in df_global.columns and not df_global.empty and model_identifier_col in df_global.columns:
                        # Don't select more models than exist; drop rows with NaN score/identifier first
                        n_top = min(3, len(df_global.dropna(subset=['Global Average', model_identifier_col])))
                        if n_top > 0:
                            default_radar_models = df_global.nlargest(n_top, 'Global Average')[model_identifier_col].tolist()

                    models_multiselect = gr.Dropdown(choices=model_choices, label="Select Models for Radar Chart (up to 5)", multiselect=True, max_choices=5, value=default_radar_models)
                    radar_chart = gr.Plot(label="Radar Comparison", elem_classes="gradio-plot")

        # --- Event Handling ---

        def update_on_filter(search, min_global, organization):
            """Re-filter df_global and refresh every table, plot and dropdown.

            Returns a tuple matching ``filter_outputs``: four table frames,
            the radar multiselect update, both plots, and four empty-string
            updates that clear the per-tab model cards.
            """
            filtered_df = filter_data(df_global, search, min_global, organization)

            # Prepare data for each table (handles empty filtered results)
            def get_table_data(df, cols):
                # Ensure cols is a list even if None/empty
                valid_cols = [c for c in cols if c in df.columns] if cols else []
                return df[valid_cols].fillna("N/A") if not df.empty and valid_cols else pd.DataFrame(columns=valid_cols)

            main_metrics_data = get_table_data(filtered_df, main_metrics_cols)
            model_details_data = get_table_data(filtered_df, model_details_cols)
            community_stats_data = get_table_data(filtered_df, community_stats_cols)
            model_mapping_data = get_table_data(filtered_df, model_mapping_cols)  # Includes 'Model Name'

            # Radar dropdown choices restricted to the filtered results
            models_list = []
            if not filtered_df.empty and model_identifier_col in filtered_df.columns and not filtered_df[model_identifier_col].isna().all():
                models_list = sorted(filtered_df[model_identifier_col].dropna().unique().tolist())

            # Update bar chart
            # NOTE(review): Component.value in Gradio is the construction-time
            # value, not the live UI selection — this fallback likely always
            # yields default_bar_metric after the user changes the dropdown.
            # Confirm whether metric_dropdown should instead be a fn input.
            current_bar_metric = metric_dropdown.value if hasattr(metric_dropdown, 'value') and metric_dropdown.value in valid_bar_metrics else default_bar_metric
            bar_plot_update = create_comparison_chart(filtered_df, current_bar_metric)

            # Update radar chart multiselect choices and value
            # NOTE(review): same Component.value caveat as above — verify.
            current_radar_models = models_multiselect.value if hasattr(models_multiselect, 'value') else []
            # Keep selected models if they are still in the filtered list, otherwise clear/reset
            valid_radar_selection = [m for m in current_radar_models if m in models_list][:5]  # Limit to 5
            radar_multiselect_update = gr.update(choices=models_list, value=valid_radar_selection)

            # Radar plot uses df_global (not filtered_df) so selected models'
            # overall profiles are compared regardless of the active filter.
            radar_plot_update = create_radar_chart(df_global, valid_radar_selection, radar_metrics)

            # Clear all model card outputs
            clear_card = gr.update(value="")

            return (
                main_metrics_data, model_details_data, community_stats_data, model_mapping_data,  # Tables
                radar_multiselect_update,  # Radar dropdown update
                bar_plot_update, radar_plot_update,  # Plots
                clear_card, clear_card, clear_card, clear_card  # Clear all card outputs
            )

        def show_model_details_card(evt: gr.SelectData, table_data: pd.DataFrame):
            """Render the HTML model card for the clicked table row.

            Looks up the clicked row's identifier in the *displayed* table,
            then pulls the full row from df_global (so the card has every
            column, not just those shown in the clicked tab). Returns a
            gr.update carrying card HTML or an inline error message.
            """
            # Basic validation of event data and table data
            if evt is None or not hasattr(evt, 'index') or not isinstance(evt.index, (list, tuple)) or not evt.index:
                # No valid selection index
                return gr.update(value="")
            if table_data is None or table_data.empty:
                # Table data is missing or empty
                return gr.update(value="<p>Error: Table data is not available.</p>")

            # Check if evt.index is within bounds
            if not (0 <= evt.index[0] < len(table_data)):
                print(f"Error: Row index {evt.index[0]} out of bounds for table length {len(table_data)}")
                return gr.update(value="<p>Error: Selected row index is out of bounds.</p>")

            row_idx = evt.index[0]  # Selected row index (evt.index is [row, col])

            try:
                # Check if the identifier column exists in the *displayed* table data
                if model_identifier_col not in table_data.columns:
                    print(f"Error: Model identifier column '{model_identifier_col}' not found in the clicked table's data. Columns are: {table_data.columns.tolist()}")
                    return gr.update(value=f"<p>Error: Cannot identify model from this table (missing '{model_identifier_col}').</p>")

                # Get the model identifier from the selected row of the *displayed* table data
                selected_model_identifier = table_data.iloc[row_idx][model_identifier_col]

                # Check if the identifier is valid (not NaN or the "N/A" fill placeholder)
                if pd.isna(selected_model_identifier) or selected_model_identifier == "N/A":
                    print(f"Warning: Invalid model identifier '{selected_model_identifier}' selected at index {row_idx}.")
                    return gr.update(value="<p>Cannot display details for this entry (invalid identifier).</p>")

                # Find the full data row in the *original* global dataframe,
                # ensuring every column needed for the card is available
                full_model_data_row = df_global[df_global[model_identifier_col] == selected_model_identifier]

                if full_model_data_row.empty:
                    print(f"Error: Could not find full details for model identifier '{selected_model_identifier}' in global data.")
                    return gr.update(value=f"<p>Error: Could not find full details for {selected_model_identifier}.</p>")

                # Convert the first found row to a dictionary (handles duplicates by taking the first)
                full_data_dict = full_model_data_row.iloc[0].to_dict()

                # Create the HTML card
                card_html = create_model_card(full_data_dict)
                return gr.update(value=card_html)

            except KeyError as e:
                # Might happen if iloc fails or the identifier column name is wrong somehow
                print(f"KeyError during model card generation: {e}. Identifier column: '{model_identifier_col}', Table columns: {table_data.columns.tolist()}")
                return gr.update(value=f"<p>Error: A data key ('{e}') was not found while generating the card.</p>")
            except Exception as e:
                # Catch any other unexpected errors; log traceback, show message inline
                import traceback
                print(f"Unexpected error generating model card: {e}\n{traceback.format_exc()}")
                return gr.update(value=f"<p>An unexpected error occurred: {e}</p>")

        def update_bar_chart_on_metric_change(metric, search, min_global, organization):
            """Redraw the bar chart for a new metric, honoring active filters."""
            # Refilter data based on current filter settings
            filtered_df = filter_data(df_global, search, min_global, organization)
            chart = create_comparison_chart(filtered_df, metric)
            return chart

        def update_radar_chart_on_selection(selected_models, search, min_global, organization):
            """Redraw the radar chart for the newly selected models.

            Filter args are received for signature symmetry but the chart is
            built from df_global so full model profiles are compared.
            """
            if not selected_models:
                # Return an empty chart with a message if no models are selected
                return go.Figure().update_layout(title="Select models to compare", polar=dict(radialaxis=dict(visible=False)))
            # Use df_global for radar chart to compare overall model profiles
            chart = create_radar_chart(df_global, selected_models, radar_metrics)
            return chart

        # --- Connect Components ---
        filter_inputs = [search_input, min_score_slider, org_dropdown]
        # Outputs include all tables, the radar multiselect, both plots, and all card outputs
        # (order must match the tuple returned by update_on_filter)
        filter_outputs = [
            main_metrics_table, model_details_table, community_stats_table, model_mapping_table,  # Tables
            models_multiselect,  # Radar dropdown
            bar_chart, radar_chart,  # Plots
            main_metrics_card_output, model_details_card_output, community_stats_card_output, model_mapping_card_output  # Card outputs
        ]

        # Trigger a full refresh when any filter changes
        search_input.submit(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)
        min_score_slider.release(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)
        org_dropdown.change(update_on_filter, inputs=filter_inputs, outputs=filter_outputs)

        # Connect table selections to their respective card outputs.
        # The table component is passed as input so the handler receives its
        # current displayed value as `table_data`.
        main_metrics_table.select(show_model_details_card, inputs=[main_metrics_table], outputs=[main_metrics_card_output])
        model_details_table.select(show_model_details_card, inputs=[model_details_table], outputs=[model_details_card_output])
        community_stats_table.select(show_model_details_card, inputs=[community_stats_table], outputs=[community_stats_card_output])
        model_mapping_table.select(show_model_details_card, inputs=[model_mapping_table], outputs=[model_mapping_card_output])

        # Connect visualization dropdowns to their update functions
        metric_dropdown.change(
            update_bar_chart_on_metric_change,
            inputs=[metric_dropdown] + filter_inputs,  # Pass filters for context
            outputs=bar_chart
        )
        models_multiselect.change(
            update_radar_chart_on_selection,
            inputs=[models_multiselect] + filter_inputs,  # Pass filters for context
            outputs=radar_chart
        )

        # --- Initial Load ---
        def initial_load():
            """Populate tables, plots and the radar dropdown on app startup.

            Return order must match ``load_outputs`` exactly.
            """
            # Prepare initial (unfiltered) data for all tables; fill NaN for display
            initial_main_metrics = df_global[main_metrics_cols].fillna("N/A") if main_metrics_cols else pd.DataFrame(columns=main_metrics_cols)
            initial_model_details = df_global[model_details_cols].fillna("N/A") if model_details_cols else pd.DataFrame(columns=model_details_cols)
            initial_community_stats = df_global[community_stats_cols].fillna("N/A") if community_stats_cols else pd.DataFrame(columns=community_stats_cols)
            initial_model_mapping = df_global[model_mapping_cols].fillna("N/A") if model_mapping_cols else pd.DataFrame(columns=model_mapping_cols)  # Includes 'Model Name'

            # Initial bar chart
            bar_plot = create_comparison_chart(df_global, default_bar_metric)

            # Initial choices for the radar multiselect dropdown
            initial_model_choices = []
            if model_identifier_col in df_global.columns and not df_global[model_identifier_col].isna().all():
                initial_model_choices = sorted(df_global[model_identifier_col].dropna().unique().tolist())

            # Initial radar chart (using the default model selection determined earlier)
            radar_plot = create_radar_chart(df_global, default_radar_models, radar_metrics)

            # Return initial state for all components; order matches load_outputs
            return (
                initial_main_metrics, initial_model_details, initial_community_stats, initial_model_mapping,  # Tables
                bar_plot,  # Bar Chart
                gr.update(value=default_radar_models, choices=initial_model_choices),  # Radar dropdown value AND choices
                radar_plot  # Radar Chart
            )

        # Outputs for the load function - order must match the initial_load return tuple
        load_outputs = [
            main_metrics_table, model_details_table, community_stats_table, model_mapping_table,  # Tables
            bar_chart,  # Bar Chart
            models_multiselect,  # Radar dropdown (to set initial value and choices)
            radar_chart  # Radar Chart
        ]
        app.load(initial_load, inputs=[], outputs=load_outputs)

    return app
722
+
723
+ # Optional: If running this file directly for testing
724
+ # Ensure df_global is valid before launching
725
# Direct-execution entry point for local testing.
# Launches the UI only when df_global loaded cleanly; otherwise explains why.
if __name__ == "__main__":
    data_ok = (
        df_global is not None
        and not df_global.empty
        and 'Error' not in df_global.columns
    )
    if data_ok:
        # debug=True eases troubleshooting during local runs
        create_leaderboard_ui().launch(debug=True)
    elif df_global is not None and 'Error' in df_global.columns:
        # Loader reported a failure via an 'Error' column
        print(f"Failed to load data: {df_global['Error'].iloc[0]}")
    else:
        print("Dataframe is None or empty. Cannot launch UI.")