"""Gradio leaderboard app for the ReliableMath benchmark.

Loads per-model results from the local ``ReliableMath.tsv`` file and serves a
searchable, filterable and sortable HTML table.
"""

import os
from html import escape
from io import StringIO
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

# Read the local TSV file
df = pd.read_csv("ReliableMath.tsv", sep="\t")
print(f"Successfully loaded {len(df)} models from local file")

# Clean up the data
df = df.dropna()  # Remove any rows with missing values
df.columns = df.columns.str.strip()  # Remove any whitespace from column names

# Rename columns to match our expected format
df = df.rename(columns={"model": "Model Name", "size": "Size", "prompt": "Prompt"})

# Marker used in the "Size" column for models whose size is not known.
UNKNOWN_SIZE = "???"

# Create size display format (e.g. "7" -> "7B"; the unknown marker is kept).
df["Size_Display"] = df["Size"].apply(
    lambda x: f"{x}B" if x != UNKNOWN_SIZE else UNKNOWN_SIZE
)

# Model names belonging to each model type offered by the type filter.
model_types = {
    "reasoning": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "OpenAI/o3-mini",
    ],
    "instruction": [
        "OpenAI/GPT-4o",
        "deepseek-ai/DeepSeek-V3",
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
        "Qwen/Qwen3-235B-A22B",
        "Qwen/Qwen3-32B",
        "Qwen/Qwen3-14B",
    ],
}

# Inclusive upper bound of each size bucket, in ascending order.
_SIZE_BUCKETS = (
    (5, "0-5B"),
    (10, "5-10B"),
    (20, "10-20B"),
    (40, "20-40B"),
    (80, "40-80B"),
)
_LARGEST_BUCKET = ">80B"

# Every value the size filter can take, in display order.
SIZE_CATEGORIES = [label for _, label in _SIZE_BUCKETS] + [
    _LARGEST_BUCKET,
    UNKNOWN_SIZE,
]


# Add size category for filtering
def get_size_category(size):
    """Return the size-filter bucket label for a model size.

    Args:
        size: Value from the "Size" column; either the ``"???"`` marker or
            something ``float()`` can parse.

    Returns:
        One of the labels in ``SIZE_CATEGORIES``. The ``"???"`` marker and any
        size that is not strictly positive are reported as ``"???"``.

    Raises:
        ValueError: If ``size`` is a string that ``float()`` cannot parse.
    """
    if size == UNKNOWN_SIZE:
        return UNKNOWN_SIZE
    value = float(size)
    # A non-positive (or NaN) size is not a real parameter count; without this
    # guard such values would fall through into the "5-10B" bucket.
    if not value > 0:
        return UNKNOWN_SIZE
    for upper_bound, label in _SIZE_BUCKETS:
        if value <= upper_bound:
            return label
    return _LARGEST_BUCKET


df["Size_Category"] = df["Size"].apply(get_size_category)

# ---------------------------------------------------------------------------
# Filtering
# ---------------------------------------------------------------------------

# Substring (matched case-insensitively) that selects each architecture
# filter value. "Qwen/" keeps DeepSeek's "...-Distill-Qwen-..." models out of
# the Qwen family.
_ARCHITECTURE_PATTERNS = {
    "deepseek": "deepseek",
    "qwen": "Qwen/",
    "openai": "openai",
}

# A model whose name contains none of these substrings counts as "others".
_KNOWN_FAMILY_PATTERNS = (
    "meta-llama",
    "deepseek",
    "Qwen",
    "google",
    "mistralai",
    "openai",
)

# Columns shown in the table, in order. "Size_Display" is presented under the
# header "Size".
_DISPLAY_COLUMNS = [
    "Rank",
    "Model Name",
    "Size_Display",
    "Prompt",
    "Prec.Avg",
    "Prud.Avg",
    "Prec.(A)",
    "Prud.(A)",
    "Len.(A)",
    "Prec.(U)",
    "Prud.(U)",
    "Len.(U)",
]

# Score columns rounded to 3 decimal places for display.
_ROUNDED_COLUMNS = [
    "Prec.Avg",
    "Prud.Avg",
    "Prec.(A)",
    "Prud.(A)",
    "Prec.(U)",
    "Prud.(U)",
]


def _name_contains(names, needle):
    """Return a boolean mask of names containing ``needle`` (literal, case-insensitive)."""
    return names.str.contains(needle, case=False, na=False, regex=False)


def _architecture_mask(names, architecture_filters):
    """Return a mask selecting names that match any requested architecture.

    Filter values that are neither a key of ``_ARCHITECTURE_PATTERNS`` nor
    ``"others"`` select nothing.
    """
    mask = pd.Series(False, index=names.index)
    for arch in architecture_filters:
        if arch in _ARCHITECTURE_PATTERNS:
            mask |= _name_contains(names, _ARCHITECTURE_PATTERNS[arch])
        elif arch == "others":
            # Include models that don't match any of the main categories
            known = pd.Series(False, index=names.index)
            for pattern in _KNOWN_FAMILY_PATTERNS:
                known |= _name_contains(names, pattern)
            mask |= ~known
    return mask


def filter_and_search_models(
    search_query, size_ranges, sort_by, type_by, architecture_filters=None
):
    """Filter, sort and rank the leaderboard rows based on user inputs.

    An empty or ``None`` value for any filter leaves that filter switched off.

    Args:
        search_query: Text matched literally and case-insensitively against
            "Model Name".
        size_ranges: Size bucket labels to keep (see ``SIZE_CATEGORIES``).
        sort_by: Column to sort by, descending. Ignored if it is not a column.
        type_by: Keys of ``model_types`` to keep; unknown keys select nothing.
        architecture_filters: Architecture values to keep ("openai", "qwen",
            "deepseek", "others").

    Returns:
        A new DataFrame with a 1-based "Rank" column, the model name, the
        formatted size (as "Size"), the prompt and the score columns.
    """
    filtered_df = df.copy()
    names = filtered_df["Model Name"]

    # Apply search filter. The query is user input, so it is matched as plain
    # text: treating it as a regex would raise on input such as "(".
    if search_query:
        filtered_df = filtered_df[_name_contains(names, search_query)]

    # Apply size range filter
    if size_ranges:
        filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]

    # Apply model type filter
    if type_by:
        allowed_names = {
            name for model_type in type_by for name in model_types.get(model_type, ())
        }
        filtered_df = filtered_df[filtered_df["Model Name"].isin(allowed_names)]

    # Apply architecture filter
    if architecture_filters:
        filtered_df = filtered_df[
            _architecture_mask(filtered_df["Model Name"], architecture_filters)
        ]

    # Sort by selected metric
    if sort_by in filtered_df.columns:
        filtered_df = filtered_df.sort_values(sort_by, ascending=False)

    # Add ranking based on the sorted metric
    filtered_df = filtered_df.reset_index(drop=True)
    filtered_df = filtered_df.assign(Rank=range(1, len(filtered_df) + 1))

    # Select columns to display and show the formatted size under "Size"
    display_df = filtered_df[_DISPLAY_COLUMNS].rename(columns={"Size_Display": "Size"})

    # Round numerical values for better display
    display_df[_ROUNDED_COLUMNS] = display_df[_ROUNDED_COLUMNS].round(3)

    return display_df


# ---------------------------------------------------------------------------
# HTML rendering
# ---------------------------------------------------------------------------

# (substring of the model name, CSS row class); the first match wins. The
# check is case-sensitive.
_ROW_CLASSES = (
    ("meta-llama", "llama-row"),
    ("deepseek", "deepseek-row"),
    ("Qwen", "qwen-row"),
    ("google", "google-row"),
    ("mistralai", "mistral-row"),
    ("OpenAI", "openai-row"),
)

# CSS class of the first three columns; all later columns use "score-cell".
_LEADING_CELL_CLASSES = ("rank-cell", "model-cell", "size-cell")


def _row_class(model_name):
    """Return the CSS row class for a model name ("others-row" if no family matches)."""
    for marker, css_class in _ROW_CLASSES:
        if marker in model_name:
            return css_class
    return "others-row"


def _model_link(model_name):
    """Return an HTML link for a model: OpenAI platform or its Hugging Face page."""
    if "OpenAI" in model_name:
        url = "https://platform.openai.com/"
    else:
        url = f"https://huggingface.co/{model_name}"
    return (
        f'<a href="{escape(url)}" target="_blank" rel="noopener noreferrer" '
        f'class="model-link">{escape(model_name)}</a>'
    )


def create_html_table(df):
    """Create an HTML table from the dataframe.

    Args:
        df: DataFrame to render. It must contain a "Model Name" column; the
            first three columns are styled as rank, model and size cells.

    Returns:
        The table as an HTML string, wrapped in a ``leaderboard-container``
        div. Header and cell text are HTML-escaped.
    """
    parts = [
        '<div class="leaderboard-container">',
        '<table class="leaderboard-table">',
    ]

    # Header
    parts.append("<thead><tr>")
    parts.extend(f"<th>{escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr></thead>")

    # Body
    parts.append("<tbody>")
    for _, row in df.iterrows():
        model_name = str(row["Model Name"])
        # Add model family class for styling
        parts.append(f'<tr class="{_row_class(model_name)}">')
        for i, col in enumerate(df.columns):
            if i < len(_LEADING_CELL_CLASSES):
                cell_class = _LEADING_CELL_CLASSES[i]
            else:
                cell_class = "score-cell"

            if col == "Model Name":
                cell_content = _model_link(model_name)
            else:
                cell_content = escape(str(row[col]))

            parts.append(f'<td class="{cell_class}">{cell_content}</td>')
        parts.append("</tr>")
    parts.append("</tbody>")

    parts.append("</table>")
    parts.append("</div>")
    return "".join(parts)


# ---------------------------------------------------------------------------
# Static content
# ---------------------------------------------------------------------------

# Custom CSS for better styling
CUSTOM_CSS = """
.leaderboard-container {
    margin-top: 20px;
    max-height: 600px;
    overflow-y: auto;
    border-radius: 8px;
    border: 1px solid #e9ecef;
}

.leaderboard-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 14px;
    background: white;
}

.leaderboard-table th {
    background-color: #f8f9fa;
    font-weight: 600;
    padding: 12px 8px;
    text-align: center;
    border-bottom: 2px solid #dee2e6;
    position: sticky;
    top: 0;
    z-index: 10;
}

.leaderboard-table th:first-child {
    width: 60px;
}

.leaderboard-table td {
    padding: 10px 8px;
    border-bottom: 1px solid #f1f3f4;
}

.leaderboard-table tbody tr:hover {
    background-color: #f8f9fa;
}

.rank-cell {
    text-align: center;
    font-weight: 600;
    color: #444;
    background-color: #f8f9fa;
    width: 60px;
}

.model-cell {
    font-weight: 500;
    max-width: 400px;
    word-wrap: break-word;
}

.model-link {
    color: #0066cc !important;
    text-decoration: none !important;
    font-weight: 500 !important;
    transition: all 0.2s ease !important;
    border-bottom: 1px solid transparent !important;
}

.model-link:hover {
    color: #0052a3 !important;
    border-bottom: 1px solid #0066cc !important;
    background-color: rgba(0, 102, 204, 0.05) !important;
    padding: 2px 4px !important;
    border-radius: 4px !important;
    margin: -2px -4px !important;
}

.size-cell {
    text-align: center;
    font-weight: 500;
    color: #666;
    min-width: 60px;
}

.score-cell {
    text-align: center;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 13px;
}

/* Model family row styling */
.llama-row { background-color: #fffbf0; }
.llama-row:hover { background-color: #fef7e0; }
.openai-row { background-color: #fffbf0; }
.openai-row:hover { background-color: #fef7e0; }
.deepseek-row { background-color: #f0f8ff; }
.deepseek-row:hover { background-color: #e6f3ff; }
.qwen-row { background-color: #f5fff5; }
.qwen-row:hover { background-color: #eaffea; }
.google-row { background-color: #fff0f5; }
.google-row:hover { background-color: #ffe6f0; }
.mistral-row { background-color: #faf5ff; }
.mistral-row:hover { background-color: #f3e8ff; }
.others-row { background-color: #f8fafc; }
.others-row:hover { background-color: #f1f5f9; }

.size-filter {
    margin-top: 10px;
}

.size-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
}

.size-filter label {
    display: flex !important;
    align-items: center !important;
    background: #f8f9fa !important;
    border: 2px solid #e9ecef !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    margin: 0 !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    font-weight: 500 !important;
    font-size: 14px !important;
    color: #495057 !important;
    min-width: 70px !important;
    justify-content: center !important;
}

.size-filter label:hover {
    background: #e9ecef !important;
    border-color: #6c757d !important;
}

.size-filter input[type="checkbox"] {
    display: none !important;
}

.size-filter input[type="checkbox"]:checked + span {
    background: #0d6efd !important;
    color: white !important;
    border-color: #0d6efd !important;
}

.size-filter label:has(input[type="checkbox"]:checked) {
    background: #0d6efd !important;
    color: white !important;
    border-color: #0d6efd !important;
    box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
}

.architecture-filter {
    margin-top: 10px;
}

.architecture-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
}

.architecture-filter label {
    display: flex !important;
    align-items: center !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    margin: 0 !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    font-weight: 500 !important;
    font-size: 14px !important;
    min-width: 140px !important;
    justify-content: center !important;
    border: 2px solid !important;
}

.architecture-filter label:hover {
    transform: translateY(-1px);
    box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
}

.architecture-filter input[type="checkbox"] {
    display: none !important;
}

/*
 * Per-option palettes. These are positional, so they must follow the order
 * of the architecture checkbox choices: OpenAI, Qwen, DeepSeek, Others.
 */

/* OpenAI styling (1st option) */
.architecture-filter label:nth-child(1) {
    background: #fffbf0 !important;
    border-color: #f7e6a3 !important;
    color: #8b4513 !important;
}

.architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
    background: #f4a261 !important;
    border-color: #f4a261 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
}

/* Qwen styling (2nd option) */
.architecture-filter label:nth-child(2) {
    background: #f5fff5 !important;
    border-color: #b3ffb3 !important;
    color: #15803d !important;
}

.architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
    background: #22c55e !important;
    border-color: #22c55e !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
}

/* DeepSeek styling (3rd option) */
.architecture-filter label:nth-child(3) {
    background: #f0f8ff !important;
    border-color: #b3d9ff !important;
    color: #1e40af !important;
}

.architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
    background: #3b82f6 !important;
    border-color: #3b82f6 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
}

/* Others styling (4th option) */
.architecture-filter label:nth-child(4) {
    background: #f8fafc !important;
    border-color: #cbd5e1 !important;
    color: #475569 !important;
}

.architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
    background: #64748b !important;
    border-color: #64748b !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(100, 116, 139, 0.3) !important;
}

/* Spare palette (5th option, currently unused) */
.architecture-filter label:nth-child(5) {
    background: #faf5ff !important;
    border-color: #d8b4fe !important;
    color: #7c3aed !important;
}

.architecture-filter label:nth-child(5):has(input[type="checkbox"]:checked) {
    background: #8b5cf6 !important;
    border-color: #8b5cf6 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(139, 92, 246, 0.3) !important;
}

/* Spare palette (6th option, currently unused) */
.architecture-filter label:nth-child(6) {
    background: #fff0f5 !important;
    border-color: #ffb3d9 !important;
    color: #be185d !important;
}

.architecture-filter label:nth-child(6):has(input[type="checkbox"]:checked) {
    background: #ec4899 !important;
    border-color: #ec4899 !important;
    color: white !important;
    box-shadow: 0 2px 4px rgba(236, 72, 153, 0.3) !important;
}

/* Search and Filter Section Styling */
.search-input input {
    border: 2px solid #e9ecef !important;
    border-radius: 12px !important;
    padding: 12px 16px !important;
    font-size: 14px !important;
    transition: all 0.3s ease !important;
    background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
}

.search-input input:focus {
    border-color: #6366f1 !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
    background: white !important;
}

.search-input input::placeholder {
    color: #6b7280 !important;
    font-style: italic !important;
}

/* Modern Sort Dropdown Styling */
.sort-dropdown-modern label {
    font-weight: 600 !important;
    color: #374151 !important;
    margin-bottom: 8px !important;
}

.sort-dropdown-modern .wrap {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border-radius: 12px !important;
    padding: 2px !important;
    border: none !important;
}

.sort-dropdown-modern select {
    background: white !important;
    border: none !important;
    border-radius: 10px !important;
    padding: 12px 16px !important;
    font-size: 14px !important;
    font-weight: 500 !important;
    color: #374151 !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}

.sort-dropdown-modern select:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
    transform: translateY(-1px) !important;
}

.sort-dropdown-modern select:focus {
    outline: none !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
}

/* Section Headers */
h3 {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    background-clip: text !important;
    margin-bottom: 12px !important;
}

/* Centered Architecture Section */
.centered-title {
    text-align: center !important;
}

.centered-filter > div {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 8px !important;
    align-items: center !important;
    justify-content: center !important;
}

/* Dark Mode Specific Styles */
@media (prefers-color-scheme: dark) {
    .leaderboard-table {
        background: #1f2937 !important;
        color: #f9fafb !important;
    }

    .leaderboard-table th {
        background-color: #374151 !important;
        color: #f9fafb !important;
        border-bottom: 2px solid #4b5563 !important;
    }

    .leaderboard-table td {
        color: #f9fafb !important;
        border-bottom: 1px solid #374151 !important;
    }

    .leaderboard-table tbody tr:hover {
        background-color: #374151 !important;
    }

    .rank-cell {
        background-color: #374151 !important;
        color: #f9fafb !important;
    }

    .model-cell {
        color: #f9fafb !important;
    }

    .size-cell {
        color: #d1d5db !important;
    }

    .score-cell {
        color: #f9fafb !important;
    }

    /* Dark mode row colors with better contrast */
    .llama-row { background-color: rgba(245, 158, 11, 0.1) !important; }
    .llama-row:hover { background-color: rgba(245, 158, 11, 0.2) !important; }
    .openai-row { background-color: rgba(245, 158, 11, 0.1) !important; }
    .openai-row:hover { background-color: rgba(245, 158, 11, 0.2) !important; }
    .deepseek-row { background-color: rgba(59, 130, 246, 0.1) !important; }
    .deepseek-row:hover { background-color: rgba(59, 130, 246, 0.2) !important; }
    .qwen-row { background-color: rgba(34, 197, 94, 0.1) !important; }
    .qwen-row:hover { background-color: rgba(34, 197, 94, 0.2) !important; }
    .google-row { background-color: rgba(236, 72, 153, 0.1) !important; }
    .google-row:hover { background-color: rgba(236, 72, 153, 0.2) !important; }
    .mistral-row { background-color: rgba(139, 92, 246, 0.1) !important; }
    .mistral-row:hover { background-color: rgba(139, 92, 246, 0.2) !important; }
    .others-row { background-color: rgba(107, 114, 128, 0.1) !important; }
    .others-row:hover { background-color: rgba(107, 114, 128, 0.2) !important; }

    .leaderboard-container {
        border: 1px solid #4b5563 !important;
    }

    .model-link {
        color: #60a5fa !important;
    }

    .model-link:hover {
        color: #93c5fd !important;
        border-bottom: 1px solid #60a5fa !important;
        background-color: rgba(96, 165, 250, 0.1) !important;
    }
}
"""

METRIC_EXPLANATIONS = """
- **Precision Score**: Percentage of successful responses where LLMs generate correct answers for solvable problems and indicate unsolvability for unsolvable problems
- **Prudence Score**: Percentage of refused responses where LLMs refuse to answer the problems
- **Prec.(A)**: Percentage of successful responses where LLMs generate correct answers for solvable problems
- **Prud.(A)**: Percentage of refused responses where LLMs refuse to answer the problems for solvable problems
- **Prec.(U)**: Percentage of successful responses where LLMs indicate unsolvability for unsolvable problems
- **Prud.(U)**: Percentage of refused responses where LLMs refuse to answer the problems for unsolvable problems
"""

# Initial values of the filter widgets; the first render of the table uses the
# same values so that the widgets and the table always agree.
DEFAULT_SORT = "Prec.Avg"
DEFAULT_TYPES = ["reasoning", "instruction"]
DEFAULT_ARCHITECTURES = ["openai", "qwen", "deepseek", "others"]


def _load_about_text(path="about.md"):
    """Return the Markdown for the "About" tab, or a notice if the file is missing."""
    try:
        return Path(path).read_text(encoding="utf-8")
    except FileNotFoundError:
        return f"*About page content (`{path}`) was not found.*"


# Update table when filters change
def update_table(search, sizes, sort_by, type_by, arch_filters):
    """Return ``(table_html, model_count_markdown)`` for the given filter values."""
    filtered_df = filter_and_search_models(
        search, sizes, sort_by, type_by, arch_filters
    )
    model_count = f"**Showing {len(filtered_df)} models**"
    return create_html_table(filtered_df), model_count


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------

# Rows shown before the user touches any filter.
initial_table_df = filter_and_search_models(
    "", SIZE_CATEGORIES, DEFAULT_SORT, DEFAULT_TYPES, DEFAULT_ARCHITECTURES
)

# Create the Gradio interface
with gr.Blocks(
    title="ReliableMath Leaderboard", theme=gr.themes.Base(), css=CUSTOM_CSS
) as app:
    gr.Markdown("# 🏆 ReliableMath Leaderboard")
    gr.Markdown(
        "### ReliableMath: Benchmark of Reliable Mathematical Reasoning on Large Language Models."
    )

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Top section with search and filters
            with gr.Row():
                # Left side - All Filters
                with gr.Column(scale=1):
                    gr.Markdown("### 🎛️ **Filter & Sort Options**")

                    # Sort dropdown with modern styling
                    with gr.Row():
                        sort_dropdown = gr.Dropdown(
                            choices=[
                                ("😁 Precision Score", "Prec.Avg"),
                                ("🧐 Prudence Score", "Prud.Avg"),
                            ],
                            value=DEFAULT_SORT,
                            label="Sort by Metric",
                            elem_classes="sort-dropdown-modern",
                            container=True,
                        )

                    # Size filters
                    gr.Markdown("**📏 Filter by Model Size:**")
                    size_checkboxes = gr.CheckboxGroup(
                        choices=list(SIZE_CATEGORIES),
                        value=list(SIZE_CATEGORIES),
                        label="",
                        elem_classes="size-filter",
                        container=False,
                    )

                    # Model architecture filters.
                    # NOTE: the per-option colours in CUSTOM_CSS are
                    # positional; keep them in step with this order.
                    gr.Markdown("**🏗️ Filter by Model Architecture:**")
                    architecture_checkboxes = gr.CheckboxGroup(
                        choices=[
                            ("🤖 OpenAI", "openai"),
                            ("🐧 Qwen", "qwen"),
                            ("🐳 DeepSeek", "deepseek"),
                            ("🔧 Others", "others"),
                        ],
                        value=list(DEFAULT_ARCHITECTURES),
                        label="",
                        elem_classes="architecture-filter",
                        container=False,
                    )

                # Right side - Search
                with gr.Column(scale=1):
                    gr.Markdown("### 🔍 **Search Models**")
                    search_box = gr.Textbox(
                        label="",
                        placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
                        value="",
                        elem_classes="search-input",
                    )

                    # Model type filters
                    gr.Markdown("**🔎 Filter by Reasoning or Instruction Models:**")
                    type_sort = gr.CheckboxGroup(
                        choices=[
                            ("🤔 reasoning", "reasoning"),
                            ("😯 instruction", "instruction"),
                        ],
                        value=list(DEFAULT_TYPES),
                        label="",
                        elem_classes="reasoning-filter",
                        container=False,
                    )

            # Model count (counts the rows actually shown, not the raw data)
            total_models = gr.Markdown(f"**Showing {len(initial_table_df)} models**")

            # Results table below filters
            results_table = gr.HTML(
                value=create_html_table(initial_table_df),
                elem_id="leaderboard-table",
            )

            # Metric explanations at the bottom
            with gr.Accordion("Metric Explanations", open=False):
                gr.Markdown(METRIC_EXPLANATIONS)

        with gr.TabItem("About"):
            gr.Markdown(_load_about_text())

    # Connect all inputs to the update function: a change to any filter
    # re-renders the table and the model count from the full set of filters.
    filter_inputs = [
        search_box,
        size_checkboxes,
        sort_dropdown,
        type_sort,
        architecture_checkboxes,
    ]
    for filter_component in filter_inputs:
        filter_component.change(
            fn=update_table,
            inputs=filter_inputs,
            outputs=[results_table, total_models],
        )

# Launch the app
if __name__ == "__main__":
    app.launch()