import gradio as gr
import pandas as pd
import numpy as np
from io import StringIO
import os
# Load the leaderboard results from the local tab-separated file.
df = pd.read_csv("ReliableMath.tsv", sep="\t")
print(f"Successfully loaded {len(df)} models from local file")

# Tidy the raw table: discard every row that has a missing value, then strip
# stray whitespace from the header names.
df = df.dropna()
df.columns = df.columns.str.strip()

# Map the lowercase TSV headers onto the column titles used by the UI.
df = df.rename(columns={"model": "Model Name", "size": "Size", "prompt": "Prompt"})

# Human-readable size: append a "B" suffix unless the size is the "???"
# placeholder, which is kept as-is.
df["Size_Display"] = df["Size"].map(
    lambda size: "???" if size == "???" else f"{size}B"
)
# Model identifiers grouped by model type. The keys are the values accepted by
# the type filter; a model absent from both lists matches neither type.
model_types = {
    "reasoning": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "OpenAI/o3-mini",
    ],
    "instruction": [
        "OpenAI/GPT-4o",
        "deepseek-ai/DeepSeek-V3",
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
        "Qwen/Qwen3-235B-A22B",
        "Qwen/Qwen3-32B",
        "Qwen/Qwen3-14B",
    ],
}
# Size buckets used for filtering, as (inclusive upper bound in billions of
# parameters, label), in ascending order.
_SIZE_BUCKETS = (
    (5, "0-5B"),
    (10, "5-10B"),
    (20, "10-20B"),
    (40, "20-40B"),
    (80, "40-80B"),
)


def get_size_category(size):
    """Return the size-bucket label for a model size given in billions.

    Args:
        size: The model size as a number or numeric string, or the
            placeholder ``"???"`` for an unknown size.

    Returns:
        One of ``"0-5B"``, ``"5-10B"``, ``"10-20B"``, ``"20-40B"``,
        ``"40-80B"``, ``">80B"`` or ``"???"``. Upper bounds are inclusive.
        Sizes that are not strictly positive (zero, negative, NaN) are
        reported as ``"???"`` instead of being placed in a numeric bucket.

    Raises:
        ValueError: If ``size`` is a string that is neither ``"???"`` nor
            parseable as a float.
    """
    if size == "???":
        return "???"

    billions = float(size)  # convert once instead of once per comparison

    # "not > 0" rather than "<= 0" so that NaN is rejected as well.
    if not billions > 0:
        return "???"

    for upper_bound, label in _SIZE_BUCKETS:
        if billions <= upper_bound:
            return label
    return ">80B"
# Label every row with its size bucket; the size filter selects on this column.
df["Size_Category"] = df["Size"].map(get_size_category)
# Substring of "Model Name" that identifies each selectable architecture family.
_ARCHITECTURE_SUBSTRINGS = {
    "deepseek": "deepseek",
    # Trailing slash: match the "Qwen/" organisation prefix only, so a name
    # such as "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" is not counted as Qwen.
    "qwen": "Qwen/",
    "openai": "openai",
}

# A model falls into "others" when its name contains none of these substrings.
_KNOWN_FAMILY_SUBSTRINGS = (
    "meta-llama",
    "deepseek",
    "Qwen",
    "google",
    "mistralai",
    "openai",
)

# Columns returned to the caller, in display order.
_DISPLAY_COLUMNS = [
    "Rank",
    "Model Name",
    "Size",
    "Prompt",
    "Prec.Avg",
    "Prud.Avg",
    "Prec.(A)",
    "Prud.(A)",
    "Len.(A)",
    "Prec.(U)",
    "Prud.(U)",
    "Len.(U)",
]

# Score columns rounded to three decimals for display.
_ROUNDED_COLUMNS = [
    "Prec.Avg",
    "Prud.Avg",
    "Prec.(A)",
    "Prud.(A)",
    "Prec.(U)",
    "Prud.(U)",
]


def _name_contains(frame, text):
    """Return a boolean mask of rows whose "Model Name" contains *text*.

    The match is a case-insensitive, literal substring test (``regex=False``),
    so characters such as ``(`` or ``+`` cannot raise a regex error.
    """
    return frame["Model Name"].str.contains(
        text, case=False, na=False, regex=False
    )


def filter_and_search_models(
    search_query, size_ranges, sort_by, type_by, architecture_filters=None
):
    """Filter, sort and rank the module-level leaderboard dataframe.

    Args:
        search_query: Text to look for in "Model Name" (case-insensitive
            literal substring). An empty value disables the search filter.
        size_ranges: "Size_Category" labels to keep. Empty or ``None``
            disables the size filter.
        sort_by: Column to sort by in descending order. Ignored when it is
            not a column of the dataframe.
        type_by: Keys of ``model_types`` whose models are kept. Empty or
            ``None`` disables the type filter.
        architecture_filters: Any of ``"deepseek"``, ``"qwen"``,
            ``"openai"`` and ``"others"``; rows matching at least one are
            kept. Unrecognised values match nothing. Empty or ``None``
            disables the architecture filter.

    Returns:
        A new DataFrame holding a 1-based "Rank" column (position after
        sorting) followed by the display columns, with the six precision /
        prudence columns rounded to three decimals.

    Raises:
        KeyError: If ``type_by`` contains a key missing from ``model_types``
            or an expected column is absent from the dataframe.
    """
    filtered = df.copy()

    # Search: literal match so arbitrary user input is safe.
    if search_query:
        filtered = filtered[_name_contains(filtered, search_query)]

    # Size buckets.
    if size_ranges:
        filtered = filtered[filtered["Size_Category"].isin(size_ranges)]

    # Model type: one membership test over the union of the selected lists,
    # which keeps the row order and cannot duplicate a row.
    if type_by:
        allowed_names = {
            name for kind in type_by for name in model_types[kind]
        }
        filtered = filtered[filtered["Model Name"].isin(allowed_names)]

    # Architecture family: a row is kept when any selected family matches.
    if architecture_filters:
        # Explicit bool dtype keeps this a boolean indexer even when no rows
        # are left at this point.
        keep = pd.Series(False, index=filtered.index, dtype=bool)
        for arch in architecture_filters:
            if arch in _ARCHITECTURE_SUBSTRINGS:
                keep |= _name_contains(filtered, _ARCHITECTURE_SUBSTRINGS[arch])
            elif arch == "others":
                known = pd.Series(False, index=filtered.index, dtype=bool)
                for substring in _KNOWN_FAMILY_SUBSTRINGS:
                    known |= _name_contains(filtered, substring)
                keep |= ~known
        filtered = filtered[keep]

    # Sort by the selected metric, best first.
    if sort_by in filtered.columns:
        filtered = filtered.sort_values(sort_by, ascending=False)

    # Rank reflects the position after sorting.
    filtered = filtered.reset_index(drop=True)
    filtered["Rank"] = range(1, len(filtered) + 1)

    # Copy once so the rounding below writes to an independent frame.
    display_df = filtered[_DISPLAY_COLUMNS].copy()
    display_df[_ROUNDED_COLUMNS] = display_df[_ROUNDED_COLUMNS].round(3)
    return display_df
def create_html_table(df):
    """Render a leaderboard dataframe as an HTML table.

    The markup uses the class names the application's stylesheet targets:
    a ``leaderboard-container`` wrapper, a ``leaderboard-table`` table, one
    ``<family>-row`` class per body row, and ``rank-cell`` / ``model-cell`` /
    ``size-cell`` / ``score-cell`` on the cells.

    Args:
        df: DataFrame to render. It must contain a "Model Name" column. The
            first three columns are styled as rank, model name and size (by
            position); every later column is styled as a score.

    Returns:
        The table as a single HTML string. All cell text is HTML-escaped.
        The model name is rendered as a link that opens in a new tab: to the
        OpenAI platform page for OpenAI models, otherwise to the model's
        Hugging Face page.

    Raises:
        KeyError: If ``df`` has no "Model Name" column and at least one row.
    """
    from html import escape

    # (lowercase substring of the model name, row class); first match wins.
    family_classes = (
        ("meta-llama", "llama-row"),
        ("deepseek", "deepseek-row"),
        ("qwen", "qwen-row"),
        ("google", "google-row"),
        ("mistralai", "mistral-row"),
        ("openai", "openai-row"),
    )
    # Classes for the leading columns, selected by column position.
    leading_cell_classes = ("rank-cell", "model-cell", "size-cell")

    parts = [
        '<div class="leaderboard-container">',
        '<table class="leaderboard-table">',
    ]

    # Header
    parts.append("<thead><tr>")
    parts.extend(f"<th>{escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr></thead>")

    # Body
    parts.append("<tbody>")
    for _, row in df.iterrows():
        model_name = str(row["Model Name"])
        # Compare in lowercase so family detection is case-insensitive.
        lowered_name = model_name.lower()
        row_class = next(
            (
                css_class
                for needle, css_class in family_classes
                if needle in lowered_name
            ),
            "others-row",
        )
        parts.append(f'<tr class="{row_class}">')

        for position, col in enumerate(df.columns):
            if position < len(leading_cell_classes):
                cell_class = leading_cell_classes[position]
            else:
                cell_class = "score-cell"

            if col == "Model Name":
                if "openai" in lowered_name:
                    model_url = "https://platform.openai.com/"
                else:
                    model_url = f"https://huggingface.co/{model_name}"
                cell_content = (
                    f'<a href="{escape(model_url)}" target="_blank" '
                    f'rel="noopener noreferrer" class="model-link">'
                    f"{escape(model_name)}</a>"
                )
            else:
                cell_content = escape(str(row[col]))

            parts.append(f'<td class="{cell_class}">{cell_content}</td>')
        parts.append("</tr>")

    parts.extend(("</tbody>", "</table>", "</div>"))
    return "".join(parts)
# Create the Gradio interface
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the section comments — confirm it against the rendered page.

# Default selections, shared by the widgets and the initial table render so the
# two cannot drift apart.
SIZE_CHOICES = ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B", "???"]
DEFAULT_SORT = "Prec.Avg"
DEFAULT_TYPES = ["reasoning", "instruction"]
DEFAULT_ARCHITECTURES = ["openai", "qwen", "deepseek", "others"]

# Rendered once up front so the model count and the table start out in sync.
initial_results = filter_and_search_models(
    "", SIZE_CHOICES, DEFAULT_SORT, DEFAULT_TYPES, DEFAULT_ARCHITECTURES
)

with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
    gr.Markdown("# 🏆 ReliableMath Leaderboard")
    gr.Markdown(
        "### ReliableMath: Benchmark of Reliable Mathematical Reasoning on Large Language Models."
    )
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Top section with search and filters
            with gr.Row():
                # Left side - All Filters
                with gr.Column(scale=1):
                    gr.Markdown("### 🎛️ **Filter & Sort Options**")
                    # Sort dropdown with modern styling
                    with gr.Row():
                        sort_dropdown = gr.Dropdown(
                            choices=[
                                ("😁 Precision Score", "Prec.Avg"),
                                ("🧐 Prudence Score", "Prud.Avg"),
                            ],
                            value=DEFAULT_SORT,
                            label="Sort by Metric",
                            elem_classes="sort-dropdown-modern",
                            container=True,
                        )
                    # Size filters
                    gr.Markdown("**📏 Filter by Model Size:**")
                    size_checkboxes = gr.CheckboxGroup(
                        choices=list(SIZE_CHOICES),
                        value=list(SIZE_CHOICES),
                        label="",
                        elem_classes="size-filter",
                        container=False,
                    )
                    # Model architecture filters
                    gr.Markdown("**🏗️ Filter by Model Architecture:**")
                    architecture_checkboxes = gr.CheckboxGroup(
                        choices=[
                            ("🤖 OpenAI", "openai"),
                            ("🐧 Qwen", "qwen"),
                            ("🐳 DeepSeek", "deepseek"),
                            ("🔧 Others", "others"),
                        ],
                        value=list(DEFAULT_ARCHITECTURES),
                        label="",
                        elem_classes="architecture-filter",
                        container=False,
                    )
                # Right side - Search
                with gr.Column(scale=1):
                    gr.Markdown("### 🔍 **Search Models**")
                    search_box = gr.Textbox(
                        label="",
                        placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
                        value="",
                        elem_classes="search-input",
                    )
                    # Model type filters
                    gr.Markdown("**🔎 Filter by Reasoning or Instruction Models:**")
                    type_sort = gr.CheckboxGroup(
                        choices=[
                            ("🤔 reasoning", "reasoning"),
                            ("😯 instruction", "instruction"),
                        ],
                        value=list(DEFAULT_TYPES),
                        label="",
                        elem_classes="reasoning-filter",
                        container=False,
                    )
            # Model count: number of rows in the table actually rendered, not
            # the size of the unfiltered dataframe.
            total_models = gr.Markdown(f"**Showing {len(initial_results)} models**")
            # Results table below filters
            results_table = gr.HTML(
                value=create_html_table(initial_results),
                elem_id="leaderboard-table",
            )
            # Metric explanations at the bottom
            with gr.Accordion("Metric Explanations", open=False):
                gr.Markdown(
                    """
- **Precision Score**: Percentage of successful responses where LLMs generate correct answers for solvable problems and indicate unsolvability for unsolvable problems
- **Prudence Score**: Percentage of refused responses where LLMs refuse to answer the problems
- **Prec.(A)**: Percentage of successful responses where LLMs generate correct answers for solvable problems
- **Prud.(A)**: Percentage of refused responses where LLMs refuse to answer the problems for solvable problems
- **Prec.(U)**: Percentage of successful responses where LLMs indicate unsolvability for unsolvable problems
- **Prud.(U)**: Percentage of refused responses where LLMs refuse to answer the problems for unsolvable problems
"""
                )
        with gr.TabItem("About"):
            # Context manager closes the file; explicit encoding avoids
            # depending on the platform default.
            with open("about.md", "r", encoding="utf-8") as about_file:
                gr.Markdown(about_file.read())

    # Update table when filters change
    def update_table(search, sizes, sort_by, type_by, arch_filters):
        """Re-run the filters and return the new table HTML and model count.

        Args:
            search: Current text of the search box.
            sizes: Selected size-bucket labels.
            sort_by: Column selected in the sort dropdown.
            type_by: Selected model types.
            arch_filters: Selected architecture families.

        Returns:
            A ``(table_html, count_markdown)`` tuple for the results table
            and the "Showing N models" label.
        """
        filtered_df = filter_and_search_models(
            search, sizes, sort_by, type_by, arch_filters
        )
        model_count = f"**Showing {len(filtered_df)} models**"
        return create_html_table(filtered_df), model_count

    # Connect all inputs to the update function: a change to any control
    # re-renders the table from the current state of every control.
    filter_inputs = [
        search_box,
        size_checkboxes,
        sort_dropdown,
        type_sort,
        architecture_checkboxes,
    ]
    for filter_component in filter_inputs:
        filter_component.change(
            fn=update_table,
            inputs=filter_inputs,
            outputs=[results_table, total_models],
        )
# Add custom CSS for better styling
# The architecture-filter buttons are coloured by position (nth-child), so the
# rules below must follow the order of the architecture checkbox choices:
# OpenAI, Qwen, DeepSeek, Others.
# NOTE(review): the stylesheet is assigned after the Blocks context has closed;
# confirm the installed Gradio version applies CSS set this way.
app.css = """
.leaderboard-container {
    margin-top: 20px;
    max-height: 600px;
    overflow-y: auto;
    border-radius: 8px;
    border: 1px solid #e9ecef;
}
.leaderboard-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 14px;
    background: white;
}
.leaderboard-table th {
    background-color: #f8f9fa;
    font-weight: 600;
    padding: 12px 8px;
    text-align: center;
    border-bottom: 2px solid #dee2e6;
    position: sticky;
    top: 0;
    z-index: 10;
}
.leaderboard-table th:first-child {
    width: 60px;
}
.leaderboard-table td {
    padding: 10px 8px;
    border-bottom: 1px solid #f1f3f4;
}
.leaderboard-table tbody tr:hover {
    background-color: #f8f9fa;
}
.rank-cell {
    text-align: center;
    font-weight: 600;
    color: #444;
    background-color: #f8f9fa;
    width: 60px;
}
.model-cell {
    font-weight: 500;
    max-width: 400px;
    word-wrap: break-word;
}
.model-link {
    color: #0066cc !important;
    text-decoration: none !important;
    font-weight: 500 !important;
    transition: all 0.2s ease !important;
    border-bottom: 1px solid transparent !important;
}
.model-link:hover {
    color: #0052a3 !important;
    border-bottom: 1px solid #0066cc !important;
    background-color: rgba(0, 102, 204, 0.05) !important;
    padding: 2px 4px !important;
    border-radius: 4px !important;
    margin: -2px -4px !important;
}
.size-cell {
    text-align: center;
    font-weight: 500;
    color: #666;
    min-width: 60px;
}
.score-cell {
    text-align: center;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 13px;
}
/* Model family row styling */
.llama-row {
    background-color: #fffbf0;
}
.llama-row:hover {
    background-color: #fef7e0;
}
.openai-row {
    background-color: #fffbf0;
}
.openai-row:hover {
    background-color: #fef7e0;
}
.deepseek-row {
    background-color: #f0f8ff;
}
.deepseek-row:hover {
    background-color: #e6f3ff;
}
.qwen-row {
    background-color: #f5fff5;
}
.qwen-row:hover {
    background-color: #eaffea;
}
.google-row {
    background-color: #fff0f5;
}
.google-row:hover {
    background-color: #ffe6f0;
}
.mistral-row {
    background-color: #faf5ff;
}
.mistral-row:hover {
    background-color: #f3e8ff;
}
.others-row {
    background-color: #f8fafc;
}
.others-row:hover {
    background-color: #f1f5f9;
}
.size-filter {
    margin-top: 10px;
}
.size-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
}
.size-filter label {
    display: flex !important;
    align-items: center !important;
    background: #f8f9fa !important;
    border: 2px solid #e9ecef !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    margin: 0 !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    font-weight: 500 !important;
    font-size: 14px !important;
    color: #495057 !important;
    min-width: 70px !important;
    justify-content: center !important;
}
.size-filter label:hover {
    background: #e9ecef !important;
    border-color: #6c757d !important;
}
.size-filter input[type="checkbox"] {
    display: none !important;
}
.size-filter input[type="checkbox"]:checked + span {
    background: #0d6efd !important;
    color: white !important;
    border-color: #0d6efd !important;
}
.size-filter label:has(input[type="checkbox"]:checked) {
    background: #0d6efd !important;
    color: white !important;
    border-color: #0d6efd !important;
    box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
}
.architecture-filter {
    margin-top: 10px;
}
.architecture-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
}
.architecture-filter label {
    display: flex !important;
    align-items: center !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    margin: 0 !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    font-weight: 500 !important;
    font-size: 14px !important;
    min-width: 140px !important;
    justify-content: center !important;
    border: 2px solid !important;
}
.architecture-filter label:hover {
    transform: translateY(-1px);
    box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
}
.architecture-filter input[type="checkbox"] {
    display: none !important;
}
/* OpenAI styling (1st checkbox) */
.architecture-filter label:nth-child(1) {
    background: #fffbf0 !important;
    border-color: #f7e6a3 !important;
    color: #8b4513 !important;
}
.architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
    background: #f4a261 !important;
    border-color: #f4a261 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
}
/* Qwen styling (2nd checkbox) */
.architecture-filter label:nth-child(2) {
    background: #f5fff5 !important;
    border-color: #b3ffb3 !important;
    color: #15803d !important;
}
.architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
    background: #22c55e !important;
    border-color: #22c55e !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
}
/* DeepSeek styling (3rd checkbox) */
.architecture-filter label:nth-child(3) {
    background: #f0f8ff !important;
    border-color: #b3d9ff !important;
    color: #1e40af !important;
}
.architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
    background: #3b82f6 !important;
    border-color: #3b82f6 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
}
/* Others styling (4th checkbox) */
.architecture-filter label:nth-child(4) {
    background: #f8fafc !important;
    border-color: #cbd5e1 !important;
    color: #475569 !important;
}
.architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
    background: #64748b !important;
    border-color: #64748b !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(100, 116, 139, 0.3) !important;
}
/* Search and Filter Section Styling */
.search-input input {
    border: 2px solid #e9ecef !important;
    border-radius: 12px !important;
    padding: 12px 16px !important;
    font-size: 14px !important;
    transition: all 0.3s ease !important;
    background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
}
.search-input input:focus {
    border-color: #6366f1 !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
    background: white !important;
}
.search-input input::placeholder {
    color: #6b7280 !important;
    font-style: italic !important;
}
/* Modern Sort Dropdown Styling */
.sort-dropdown-modern label {
    font-weight: 600 !important;
    color: #374151 !important;
    margin-bottom: 8px !important;
}
.sort-dropdown-modern .wrap {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border-radius: 12px !important;
    padding: 2px !important;
    border: none !important;
}
.sort-dropdown-modern select {
    background: white !important;
    border: none !important;
    border-radius: 10px !important;
    padding: 12px 16px !important;
    font-size: 14px !important;
    font-weight: 500 !important;
    color: #374151 !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}
.sort-dropdown-modern select:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
    transform: translateY(-1px) !important;
}
.sort-dropdown-modern select:focus {
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
}
/* Section Headers */
h3 {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    margin-bottom: 12px !important;
}
/* Centered Architecture Section */
.centered-title {
    text-align: center !important;
}
.centered-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
    justify-content: center !important;
}
/* Dark Mode Specific Styles */
@media (prefers-color-scheme: dark) {
    .leaderboard-table {
        background: #1f2937 !important;
        color: #f9fafb !important;
    }
    .leaderboard-table th {
        background-color: #374151 !important;
        color: #f9fafb !important;
        border-bottom: 2px solid #4b5563 !important;
    }
    .leaderboard-table td {
        color: #f9fafb !important;
        border-bottom: 1px solid #374151 !important;
    }
    .leaderboard-table tbody tr:hover {
        background-color: #374151 !important;
    }
    .rank-cell {
        background-color: #374151 !important;
        color: #f9fafb !important;
    }
    .model-cell {
        color: #f9fafb !important;
    }
    .size-cell {
        color: #d1d5db !important;
    }
    .score-cell {
        color: #f9fafb !important;
    }
    /* Dark mode row colors with better contrast */
    .llama-row {
        background-color: rgba(245, 158, 11, 0.1) !important;
    }
    .llama-row:hover {
        background-color: rgba(245, 158, 11, 0.2) !important;
    }
    .openai-row {
        background-color: rgba(245, 158, 11, 0.1) !important;
    }
    .openai-row:hover {
        background-color: rgba(245, 158, 11, 0.2) !important;
    }
    .deepseek-row {
        background-color: rgba(59, 130, 246, 0.1) !important;
    }
    .deepseek-row:hover {
        background-color: rgba(59, 130, 246, 0.2) !important;
    }
    .qwen-row {
        background-color: rgba(34, 197, 94, 0.1) !important;
    }
    .qwen-row:hover {
        background-color: rgba(34, 197, 94, 0.2) !important;
    }
    .google-row {
        background-color: rgba(236, 72, 153, 0.1) !important;
    }
    .google-row:hover {
        background-color: rgba(236, 72, 153, 0.2) !important;
    }
    .mistral-row {
        background-color: rgba(139, 92, 246, 0.1) !important;
    }
    .mistral-row:hover {
        background-color: rgba(139, 92, 246, 0.2) !important;
    }
    .others-row {
        background-color: rgba(107, 114, 128, 0.1) !important;
    }
    .others-row:hover {
        background-color: rgba(107, 114, 128, 0.2) !important;
    }
    .leaderboard-container {
        border: 1px solid #4b5563 !important;
    }
    .model-link {
        color: #60a5fa !important;
    }
    .model-link:hover {
        color: #93c5fd !important;
        border-bottom: 1px solid #60a5fa !important;
        background-color: rgba(96, 165, 250, 0.1) !important;
    }
}
"""

# Launch the app
if __name__ == "__main__":
    app.launch()