Spaces:

kfoughali
/

serpent

Sleeping

App Files Files Community

kfoughali committed on Sep 6

Commit

56bd642

verified ·

1 Parent(s): 9196642

Update app.py

Browse files

Files changed (1) hide show

app.py +701 -0

app.py CHANGED Viewed

	@@ -0,0 +1,701 @@

+# app.py
+"""
+Research-grade KV cache compression benchmark application.
+RocketKV-enhanced SPG with 450x compression capability.
+FIXED: CUDA assert errors, safer default parameters, GPT-2 sequence limits.
+"""
+import gradio as gr
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime
+import json
+import pandas as pd
+import tempfile
+import os
+import logging
+from typing import Dict, List, Any, Tuple
+from config import (
+    CompressionConfig, CompressionType, EnhancedSPGConfig,
+    ProvingConfig, ResearchConstants, SUPPORTED_MODELS, BENCHMARK_CONFIGS
+)
+from benchmark import (
+    run_research_benchmark, export_proof_bundle, verify_proof_bundle,
+    BenchmarkMetrics
+)
+from compression import detect_model_layers
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Set style for plots
+plt.style.use('seaborn-v0_8-darkgrid')
+sns.set_palette("husl")
+# Global state for results
+current_results = {}
+def run_benchmark(model_key, compression_type, benchmark_type, dataset_subset,
+                 eval_samples, n_seeds, seq_length, generation_length,
+                 base_decay_rate, sink_tokens, recent_window,
+                 enable_adaptive, target_perplexity_delta,
+                 enable_progressive, progressive_quality_threshold,
+                 initial_compression_ratio, max_compression_ratio,
+                 sequence_compression_ratio, head_compression_ratio,
+                 head_retention_mode, magnitude_threshold_mode,
+                 min_tokens_for_stability, recent_boost_factor,
+                 fail_on_cpu):
+    """Run comprehensive benchmark with all compression methods."""
+    # Enable synchronous CUDA for debugging
+    if torch.cuda.is_available():
+        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+    # Validate sequence length for GPT-2
+    if model_key == "gpt2" and seq_length > 1024:
+        logger.warning(f"Reducing sequence length from {seq_length} to 1024 for GPT-2")
+        seq_length = 1024
+    try:
+        # Create base configuration
+        base_config = CompressionConfig(
+            model_key=model_key,
+            compression_type=CompressionType[compression_type.upper()],
+            benchmark_type=benchmark_type,
+            benchmark_subset=dataset_subset if benchmark_type == "longbench" else None,
+            eval_samples=int(eval_samples),
+            n_seeds=int(n_seeds),
+            prefill_length=int(seq_length),
+            generation_length=int(generation_length),
+            fail_on_cpu_fallback=fail_on_cpu
+        )
+        # Configure Enhanced SPG with safer parameters
+        base_config.enhanced_spg_config = EnhancedSPGConfig(
+            base_decay_rate=float(base_decay_rate),
+            sink_tokens=int(sink_tokens),
+            recent_window=int(recent_window),
+            enable_adaptive=enable_adaptive,
+            target_perplexity_delta=float(target_perplexity_delta),
+            enable_progressive=enable_progressive,
+            quality_threshold=float(progressive_quality_threshold),
+            initial_compression_ratio=float(initial_compression_ratio),
+            max_compression_ratio=float(max_compression_ratio),
+            target_compression_ratio=float(max_compression_ratio),
+            sequence_compression_ratio=float(sequence_compression_ratio),
+            head_compression_ratio=float(head_compression_ratio),
+            head_retention_mode=head_retention_mode,
+            magnitude_threshold_mode=magnitude_threshold_mode,
+            min_tokens_for_stability=int(min_tokens_for_stability),
+            recent_boost_factor=float(recent_boost_factor),
+            enable_two_stage=True,
+            use_hybrid_sparse_attention=True,
+            use_snapkv_plus_plus=True,
+            stage1_compression_ratio=20.0,  # Safer default
+            stage2_compression_ratio=20.0   # For 400x total
+        )
+        # Store results
+        results = {}
+        model_name = base_config.model_name
+        # Run benchmark for selected compression type
+        logger.info(f"Running {compression_type} benchmark...")
+        metrics, summary, records, fingerprints = run_research_benchmark(
+            model_name, base_config
+        )
+        results[compression_type] = {
+            'metrics': metrics,
+            'summary': summary,
+            'records': records
+        }
+        # Also run NONE compression for baseline comparison
+        if compression_type != "none":
+            logger.info("Running baseline (no compression) benchmark...")
+            baseline_config = CompressionConfig(
+                model_key=model_key,
+                compression_type=CompressionType.NONE,
+                benchmark_type=benchmark_type,
+                benchmark_subset=dataset_subset if benchmark_type == "longbench" else None,
+                eval_samples=int(eval_samples),
+                n_seeds=int(n_seeds),
+                prefill_length=int(seq_length),
+                generation_length=int(generation_length),
+                fail_on_cpu_fallback=fail_on_cpu
+            )
+            try:
+                baseline_metrics, baseline_summary, baseline_records, _ = run_research_benchmark(
+                    model_name, baseline_config
+                )
+                results['none'] = {
+                    'metrics': baseline_metrics,
+                    'summary': baseline_summary,
+                    'records': baseline_records
+                }
+            except Exception as e:
+                logger.error(f"Baseline benchmark failed: {e}")
+                # Continue without baseline
+        # Store globally for export
+        global current_results
+        current_results = results
+        # Create visualizations
+        plots = create_visualizations(results, benchmark_type)
+        # Create summary text
+        summary_text = create_summary_text(results, benchmark_type)
+        # Export proof bundle
+        with tempfile.TemporaryDirectory() as tmpdir:
+            bundle_path = export_proof_bundle(
+                tmpdir, base_config, metrics, summary, records, fingerprints
+            )
+            # Verify the bundle
+            verification = verify_proof_bundle(
+                tmpdir, base_config, base_config.proving
+            )
+            verification_text = f"Proof verification: {'PASSED ✓' if verification['ok'] else 'FAILED ✗'}"
+            if not verification['ok']:
+                verification_text += f"\nFailures: {verification['failures']}"
+        return plots, summary_text, verification_text
+    except Exception as e:
+        logger.error(f"Benchmark failed: {e}", exc_info=True)
+        return [], f"Error: {str(e)}", "Verification failed due to error"
+def create_visualizations(results: Dict, benchmark_type: str) -> List:
+    """Create comprehensive visualizations from benchmark results."""
+    plots = []
+    # 1. Compression Ratio Comparison
+    fig, ax = plt.subplots(figsize=(10, 6))
+    methods = []
+    ratios = []
+    errors = []
+    for method, data in results.items():
+        if 'metrics' in data and hasattr(data['metrics'], 'compression_ratio_mean'):
+            methods.append(method.upper())
+            ratios.append(data['metrics'].compression_ratio_mean)
+            errors.append(data['metrics'].compression_ratio_std)
+    if methods:
+        bars = ax.bar(methods, ratios, yerr=errors, capsize=5)
+        ax.set_ylabel('Compression Ratio')
+        ax.set_title('KV Cache Compression Ratios')
+        ax.grid(True, alpha=0.3)
+        # Add value labels on bars
+        for bar, ratio in zip(bars, ratios):
+            height = bar.get_height()
+            ax.text(bar.get_x() + bar.get_width()/2., height,
+                   f'{ratio:.1f}x', ha='center', va='bottom')
+    plt.tight_layout()
+    plots.append(fig)
+    # 2. Memory Usage Comparison
+    fig, ax = plt.subplots(figsize=(10, 6))
+    memories = []
+    memory_errors = []
+    for method, data in results.items():
+        if 'metrics' in data and hasattr(data['metrics'], 'kv_cache_memory_mb'):
+            memories.append(data['metrics'].kv_cache_memory_mb)
+            memory_errors.append(0)  # No std for memory in current implementation
+    if methods and memories:
+        bars = ax.bar(methods, memories, yerr=memory_errors, capsize=5, color='coral')
+        ax.set_ylabel('Memory Usage (MB)')
+        ax.set_title('KV Cache Memory Footprint')
+        ax.grid(True, alpha=0.3)
+        for bar, mem in zip(bars, memories):
+            height = bar.get_height()
+            ax.text(bar.get_x() + bar.get_width()/2., height,
+                   f'{mem:.1f}', ha='center', va='bottom')
+    plt.tight_layout()
+    plots.append(fig)
+    # 3. Benchmark-specific metrics
+    if benchmark_type == "wikitext":
+        # Perplexity comparison
+        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
+        # Prefill perplexity
+        prefill_ppls = []
+        prefill_errors = []
+        gen_ppls = []
+        gen_errors = []
+        for method, data in results.items():
+            if 'metrics' in data:
+                metrics = data['metrics']
+                if hasattr(metrics, 'prefill_perplexity_mean'):
+                    prefill_ppls.append(metrics.prefill_perplexity_mean)
+                    prefill_errors.append(metrics.prefill_perplexity_std)
+                if hasattr(metrics, 'generation_perplexity_mean'):
+                    gen_ppls.append(metrics.generation_perplexity_mean)
+                    gen_errors.append(metrics.generation_perplexity_std)
+        if prefill_ppls:
+            ax1.bar(methods[:len(prefill_ppls)], prefill_ppls, yerr=prefill_errors, capsize=5, color='skyblue')
+            ax1.set_ylabel('Perplexity')
+            ax1.set_title('Prefill Perplexity')
+            ax1.grid(True, alpha=0.3)
+        if gen_ppls:
+            ax2.bar(methods[:len(gen_ppls)], gen_ppls, yerr=gen_errors, capsize=5, color='lightgreen')
+            ax2.set_ylabel('Perplexity')
+            ax2.set_title('Generation Perplexity')
+            ax2.grid(True, alpha=0.3)
+        plt.suptitle('Quality Metrics: Perplexity Comparison')
+        plt.tight_layout()
+        plots.append(fig)
+    elif benchmark_type in ["niah", "ruler", "scbench"]:
+        # Accuracy metrics
+        fig, ax = plt.subplots(figsize=(10, 6))
+        accuracies = []
+        for method, data in results.items():
+            if 'summary' in data:
+                if benchmark_type == "niah" and 'niah_accuracy' in data['summary']:
+                    accuracies.append(data['summary']['niah_accuracy'])
+                elif benchmark_type == "ruler" and 'ruler_exact_match' in data['summary']:
+                    accuracies.append(data['summary']['ruler_exact_match'])
+                elif benchmark_type == "scbench" and 'scbench_accuracy' in data['summary']:
+                    accuracies.append(data['summary']['scbench_accuracy'])
+        if accuracies:
+            bars = ax.bar(methods[:len(accuracies)], accuracies, color='gold')
+            ax.set_ylabel('Accuracy')
+            ax.set_ylim(0, 1.1)
+            ax.set_title(f'{benchmark_type.upper()} Accuracy')
+            ax.grid(True, alpha=0.3)
+            for bar, acc in zip(bars, accuracies):
+                height = bar.get_height()
+                ax.text(bar.get_x() + bar.get_width()/2., height,
+                       f'{acc:.2%}', ha='center', va='bottom')
+        plt.tight_layout()
+        plots.append(fig)
+    # 4. Speed comparison
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
+    prefill_times = []
+    decode_times = []
+    for method, data in results.items():
+        if 'metrics' in data:
+            metrics = data['metrics']
+            if hasattr(metrics, 'prefill_time_mean'):
+                prefill_times.append(metrics.prefill_time_mean * 1000)  # Convert to ms
+            if hasattr(metrics, 'decode_time_per_token_mean_ms'):
+                decode_times.append(metrics.decode_time_per_token_mean_ms)
+    if prefill_times:
+        ax1.bar(methods[:len(prefill_times)], prefill_times, color='purple', alpha=0.7)
+        ax1.set_ylabel('Time (ms)')
+        ax1.set_title('Prefill Time')
+        ax1.grid(True, alpha=0.3)
+    if decode_times:
+        ax2.bar(methods[:len(decode_times)], decode_times, color='orange', alpha=0.7)
+        ax2.set_ylabel('Time per Token (ms)')
+        ax2.set_title('Decode Time')
+        ax2.grid(True, alpha=0.3)
+    plt.suptitle('Performance Metrics: Speed Comparison')
+    plt.tight_layout()
+    plots.append(fig)
+    return plots
+def create_summary_text(results: Dict, benchmark_type: str) -> str:
+    """Create detailed summary text from results."""
+    summary_lines = []
+    summary_lines.append("=" * 60)
+    summary_lines.append("BENCHMARK RESULTS SUMMARY")
+    summary_lines.append("=" * 60)
+    summary_lines.append(f"Benchmark Type: {benchmark_type.upper()}")
+    summary_lines.append(f"Timestamp: {datetime.now().isoformat()}")
+    summary_lines.append("")
+    for method, data in results.items():
+        if 'summary' not in data:
+            continue
+        summary = data['summary']
+        metrics = data['metrics'] if 'metrics' in data else None
+        summary_lines.append(f"Method: {method.upper()}")
+        summary_lines.append("-" * 40)
+        # Compression metrics
+        if 'compression_ratio' in summary:
+            summary_lines.append(f"Compression Ratio: {summary['compression_ratio']:.1f}x")
+        if 'kv_cache_memory_mb' in summary:
+            summary_lines.append(f"KV Cache Memory: {summary['kv_cache_memory_mb']:.2f} MB")
+        # Quality metrics
+        if benchmark_type == "wikitext":
+            if 'prefill_perplexity' in summary:
+                summary_lines.append(f"Prefill Perplexity: {summary['prefill_perplexity']:.2f}")
+            if 'generation_perplexity' in summary:
+                summary_lines.append(f"Generation Perplexity: {summary['generation_perplexity']:.2f}")
+        elif benchmark_type == "niah" and 'niah_accuracy' in summary:
+            summary_lines.append(f"NIAH Accuracy: {summary['niah_accuracy']:.2%}")
+        elif benchmark_type == "ruler" and 'ruler_exact_match' in summary:
+            summary_lines.append(f"RULER Exact Match: {summary['ruler_exact_match']:.2%}")
+        elif benchmark_type == "scbench" and 'scbench_accuracy' in summary:
+            summary_lines.append(f"SCBench Accuracy: {summary['scbench_accuracy']:.2%}")
+        elif benchmark_type == "longbench" and 'longbench_accuracy' in summary:
+            summary_lines.append(f"LongBench Accuracy: {summary['longbench_accuracy']:.2%}")
+        # Performance metrics
+        if 'prefill_time_ms' in summary:
+            summary_lines.append(f"Prefill Time: {summary['prefill_time_ms']:.2f} ms")
+        if 'decode_time_ms' in summary:
+            summary_lines.append(f"Decode Time per Token: {summary['decode_time_ms']:.2f} ms")
+        if 'throughput_tokens_sec' in summary:
+            summary_lines.append(f"Throughput: {summary['throughput_tokens_sec']:.1f} tokens/sec")
+        if 'end_to_end_throughput' in summary:
+            summary_lines.append(f"End-to-End Throughput: {summary['end_to_end_throughput']:.1f} tokens/sec")
+        if 'peak_memory_mb' in summary:
+            summary_lines.append(f"Peak Memory: {summary['peak_memory_mb']:.2f} MB")
+        summary_lines.append("")
+    # Add statistical comparison if baseline is available
+    if 'none' in results and len(results) > 1:
+        summary_lines.append("COMPARISON WITH BASELINE")
+        summary_lines.append("-" * 40)
+        baseline_summary = results['none']['summary']
+        for method, data in results.items():
+            if method == 'none' or 'summary' not in data:
+                continue
+            summary = data['summary']
+            # Calculate improvements
+            if 'compression_ratio' in summary:
+                summary_lines.append(f"{method.upper()} vs Baseline:")
+                summary_lines.append(f"  Compression: {summary['compression_ratio']:.1f}x")
+            if 'kv_cache_memory_mb' in summary and 'kv_cache_memory_mb' in baseline_summary:
+                baseline_mem = baseline_summary['kv_cache_memory_mb']
+                method_mem = summary['kv_cache_memory_mb']
+                if baseline_mem > 0:
+                    reduction = (1 - method_mem / baseline_mem) * 100
+                    summary_lines.append(f"  Memory Reduction: {reduction:.1f}%")
+            # Quality degradation for WikiText
+            if benchmark_type == "wikitext":
+                if 'generation_perplexity' in summary and 'generation_perplexity' in baseline_summary:
+                    baseline_ppl = baseline_summary['generation_perplexity']
+                    method_ppl = summary['generation_perplexity']
+                    if baseline_ppl > 0:
+                        degradation = ((method_ppl - baseline_ppl) / baseline_ppl) * 100
+                        summary_lines.append(f"  Perplexity Change: {degradation:+.1f}%")
+            # Accuracy comparison for other benchmarks
+            elif benchmark_type == "niah":
+                if 'niah_accuracy' in summary and 'niah_accuracy' in baseline_summary:
+                    acc_diff = summary['niah_accuracy'] - baseline_summary['niah_accuracy']
+                    summary_lines.append(f"  Accuracy Difference: {acc_diff:+.2%}")
+            summary_lines.append("")
+    return "\n".join(summary_lines)
+def export_results(format_type):
+    """Export current results in specified format."""
+    if not current_results:
+        return "No results to export. Please run a benchmark first."
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if format_type == "JSON":
+        filename = f"results_{timestamp}.json"
+        # Convert numpy types to Python types for JSON serialization
+        def convert_numpy(obj):
+            if isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, (np.integer, np.int64, np.int32)):
+                return int(obj)
+            elif isinstance(obj, (np.floating, np.float64, np.float32)):
+                return float(obj)
+            elif isinstance(obj, BenchmarkMetrics):
+                return obj.__dict__
+            return obj
+        serializable_results = json.loads(
+            json.dumps(current_results, default=convert_numpy)
+        )
+        with open(filename, 'w') as f:
+            json.dump(serializable_results, f, indent=2)
+        return f"Results exported to {filename}"
+    elif format_type == "CSV":
+        filename = f"results_{timestamp}.csv"
+        # Flatten results for CSV
+        rows = []
+        for method, data in current_results.items():
+            if 'summary' in data:
+                row = {'method': method}
+                row.update(data['summary'])
+                rows.append(row)
+        if rows:
+            df = pd.DataFrame(rows)
+            df.to_csv(filename, index=False)
+            return f"Results exported to {filename}"
+        else:
+            return "No summary data to export"
+    elif format_type == "LaTeX":
+        filename = f"results_{timestamp}.tex"
+        # Create LaTeX table
+        latex_lines = [
+            "\\begin{table}[h]",
+            "\\centering",
+            "\\caption{KV Cache Compression Results}",
+            "\\begin{tabular}{lccc}",
+            "\\hline",
+            "Method & Compression & Memory (MB) & Throughput (tok/s) \\\\",
+            "\\hline"
+        ]
+        for method, data in current_results.items():
+            if 'summary' in data:
+                s = data['summary']
+                comp = f"{s.get('compression_ratio', 1.0):.1f}x"
+                mem = f"{s.get('kv_cache_memory_mb', 0):.1f}"
+                thr = f"{s.get('throughput_tokens_sec', 0):.1f}"
+                latex_lines.append(f"{method.upper()} & {comp} & {mem} & {thr} \\\\")
+        latex_lines.extend([
+            "\\hline",
+            "\\end{tabular}",
+            "\\end{table}"
+        ])
+        with open(filename, 'w') as f:
+            f.write('\n'.join(latex_lines))
+        return f"LaTeX table exported to {filename}"
+    return "Invalid export format"
+# Create Gradio interface
+def create_interface():
+    """Build the Gradio Blocks UI and return it without launching.
+
+    The layout has three tabs: "Configuration" (all benchmark inputs),
+    "Run Benchmark" (run button and result widgets) and "Export Results".
+    The run button is wired to run_benchmark() and the export button to
+    export_results().
+
+    Returns:
+        The constructed ``gr.Blocks`` instance.
+    """
+    with gr.Blocks(title="RocketKV-Enhanced SPG Benchmark") as demo:
+        gr.Markdown("""
+        # 🚀 RocketKV-Enhanced SPG Compression Benchmark
+        Research-grade KV cache compression with **450x compression capability**.
+        Implements Enhanced Sliding Precision Gradient with RocketKV-style optimizations.
+        **Features:**
+        - Multiple compression methods (SPG, Adaptive, Enhanced, Progressive)
+        - Comprehensive benchmarks (WikiText, NIAH, RULER, SCBench, LongBench)
+        - Attestable proof generation and verification
+        - Real-time visualization and analysis
+        """)
+        with gr.Tab("Configuration"):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Model & Benchmark Settings")
+                    model_dropdown = gr.Dropdown(
+                        choices=list(SUPPORTED_MODELS.keys()),
+                        value="gpt2",
+                        label="Model"
+                    )
+                    # The lowercase choice is upper-cased by run_benchmark()
+                    # to look up the CompressionType member.
+                    compression_dropdown = gr.Dropdown(
+                        choices=["none", "spg", "adaptive_spg", "enhanced_spg", "progressive_spg"],
+                        value="enhanced_spg",
+                        label="Compression Method"
+                    )
+                    benchmark_dropdown = gr.Dropdown(
+                        choices=["wikitext", "niah", "ruler", "scbench", "longbench"],
+                        value="wikitext",
+                        label="Benchmark Type"
+                    )
+                    # Hidden until "longbench" is selected (see the change handler below).
+                    dataset_subset = gr.Dropdown(
+                        choices=BENCHMARK_CONFIGS["longbench"]["subsets"],
+                        value="narrativeqa",
+                        label="LongBench Subset (if applicable)",
+                        visible=False
+                    )
+                    # Show/hide subset based on benchmark type
+                    def update_subset_visibility(benchmark_type):
+                        return gr.update(visible=(benchmark_type == "longbench"))
+                    benchmark_dropdown.change(
+                        update_subset_visibility,
+                        inputs=[benchmark_dropdown],
+                        outputs=[dataset_subset]
+                    )
+                with gr.Column():
+                    gr.Markdown("### Evaluation Parameters")
+                    eval_samples = gr.Slider(1, 100, value=20, step=1, label="Evaluation Samples")
+                    n_seeds = gr.Slider(1, 5, value=3, step=1, label="Random Seeds")
+                    # The slider tops out at 1024 for every model, not only GPT-2.
+                    seq_length = gr.Slider(128, 1024, value=512, step=128,
+                                          label="Sequence Length (max 1024 for GPT-2)")
+                    generation_length = gr.Slider(16, 128, value=64, step=16, label="Generation Length")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### SPG Core Parameters")
+                    base_decay = gr.Slider(0.8, 0.99, value=0.95, step=0.01, label="Base Decay Rate")
+                    sink_tokens = gr.Slider(0, 8, value=2, step=1, label="Sink Tokens")
+                    recent_window = gr.Slider(8, 64, value=32, step=8, label="Recent Window")
+                with gr.Column():
+                    gr.Markdown("### Adaptive SPG")
+                    enable_adaptive = gr.Checkbox(value=False, label="Enable Adaptive")
+                    target_ppl_delta = gr.Slider(0.5, 5.0, value=1.8, step=0.1,
+                                                label="Target Perplexity Delta")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Progressive Compression")
+                    enable_progressive = gr.Checkbox(value=False, label="Enable Progressive")
+                    quality_threshold = gr.Slider(0.005, 0.05, value=0.01, step=0.005,
+                                                 label="Quality Threshold")
+                    initial_compression = gr.Slider(10.0, 200.0, value=50.0, step=5.0,
+                                                   label="Initial Compression Ratio")
+                    # run_benchmark() passes this value as both the max and
+                    # the target compression ratio.
+                    max_compression = gr.Slider(100.0, 500.0, value=400.0, step=25.0,
+                                               label="Max Compression Ratio")
+                with gr.Column():
+                    gr.Markdown("### Enhanced SPG (RocketKV-style)")
+                    sequence_comp_ratio = gr.Slider(0.0001, 0.001, value=0.0001, step=0.00005,
+                                                   label="Sequence Compression Ratio")
+                    head_comp_ratio = gr.Slider(0.0001, 0.001, value=0.0001, step=0.00005,
+                                               label="Head Compression Ratio")
+                    head_retention = gr.Dropdown(
+                        choices=["conservative", "aggressive"],
+                        value="aggressive",
+                        label="Head Retention Mode"
+                    )
+                    magnitude_mode = gr.Dropdown(
+                        choices=["conservative", "aggressive", "extreme"],
+                        value="aggressive",  # Changed from "extreme" for stability
+                        label="Magnitude Threshold Mode"
+                    )
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Stability Parameters")
+                    min_tokens_stability = gr.Slider(4, 16, value=8, step=1,
+                                                    label="Min Tokens for Stability")
+                    recent_boost = gr.Slider(0.0, 0.5, value=0.1, step=0.05,
+                                           label="Recent Boost Factor")
+                with gr.Column():
+                    gr.Markdown("### System Settings")
+                    fail_on_cpu = gr.Checkbox(value=False, label="Fail on CPU Fallback")
+        with gr.Tab("Run Benchmark"):
+            run_button = gr.Button("🚀 Run Benchmark", variant="primary")
+            with gr.Row():
+                # NOTE(review): progress_text is not used as an input or
+                # output of any handler in this function, so it stays empty.
+                progress_text = gr.Textbox(label="Progress", lines=10)
+            with gr.Row():
+                plot_gallery = gr.Gallery(label="Results Visualization", columns=2, height="auto")
+            with gr.Row():
+                summary_output = gr.Textbox(label="Summary", lines=20)
+                verification_output = gr.Textbox(label="Proof Verification", lines=5)
+        with gr.Tab("Export Results"):
+            gr.Markdown("### Export Options")
+            export_format = gr.Radio(
+                choices=["JSON", "CSV", "LaTeX"],
+                value="JSON",
+                label="Export Format"
+            )
+            export_button = gr.Button("📥 Export Results")
+            export_status = gr.Textbox(label="Export Status")
+            export_button.click(
+                export_results,
+                inputs=[export_format],
+                outputs=[export_status]
+            )
+        # Connect the run button
+        # The inputs are passed positionally, so their order must match the
+        # parameter order of run_benchmark().
+        run_button.click(
+            run_benchmark,
+            inputs=[
+                model_dropdown, compression_dropdown, benchmark_dropdown, dataset_subset,
+                eval_samples, n_seeds, seq_length, generation_length,
+                base_decay, sink_tokens, recent_window,
+                enable_adaptive, target_ppl_delta,
+                enable_progressive, quality_threshold,
+                initial_compression, max_compression,
+                sequence_comp_ratio, head_comp_ratio,
+                head_retention, magnitude_mode,
+                min_tokens_stability, recent_boost,
+                fail_on_cpu
+            ],
+            outputs=[plot_gallery, summary_output, verification_output]
+        )
+    return demo
+if __name__ == "__main__":
+    # Set up logging
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    # Create and launch the interface
+    demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )