AdnanElAssadi commited on
Commit
9f8b4b9
·
verified ·
1 Parent(s): e1f1819

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +707 -261
app.py CHANGED
@@ -3,69 +3,41 @@ import json
3
  import os
4
  from pathlib import Path
5
 
6
- # Create a minimal demo with hardcoded sample data
7
- SAMPLE_TASK = {
8
- "task_name": "Demo Reranking Task",
9
- "task_type": "reranking",
10
- "instructions": "Rank the documents from most relevant to least relevant for the given query.",
11
- "samples": [
12
- {
13
- "id": "sample_1",
14
- "query": "How do I install Python on Windows?",
15
- "candidates": [
16
- "To install Python on Windows, go to python.org and download the latest installer. Run the installer and make sure to check 'Add Python to PATH' during installation.",
17
- "Python is a popular programming language used for web development, data analysis, and machine learning.",
18
- "Windows is an operating system developed by Microsoft. It's the most popular desktop operating system worldwide.",
19
- "Installing software on Windows typically involves downloading an installer and running it with administrator privileges.",
20
- "Programming languages like Python, JavaScript, and Java are essential tools for modern software development."
21
- ]
22
- },
23
- {
24
- "id": "sample_2",
25
- "query": "What are the benefits of exercise?",
26
- "candidates": [
27
- "Regular exercise improves cardiovascular health, strengthens muscles, and can help with weight management.",
28
- "Exercise releases endorphins which can improve mood and reduce feelings of depression and anxiety.",
29
- "A balanced diet is important for maintaining good health and providing energy for daily activities.",
30
- "Regular physical activity can reduce the risk of chronic diseases such as heart disease, diabetes, and certain cancers.",
31
- "Sleep is essential for recovery and overall health, with most adults needing 7-9 hours per night."
32
- ]
33
- }
34
- ]
35
- }
36
-
37
  def create_reranking_interface(task_data):
38
  """Create a Gradio interface for reranking evaluation."""
39
  samples = task_data["samples"]
40
  results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
41
  completed_samples = {s["id"]: False for s in samples}
42
 
43
- # Store the current document order for the active sample
44
- current_order = []
45
-
46
- def save_ranking(sample_id):
47
- """Save the current document ordering as rankings."""
48
  try:
49
- if not current_order:
50
- return "⚠️ No document ordering found", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
 
 
51
 
52
- # Convert document positions to rankings
53
- # First document (position 0) gets rank 1, etc.
54
- rankings = []
55
- for i, doc_idx in enumerate(current_order):
56
- rankings.append(i + 1) # Convert to 1-based ranks
57
 
 
 
 
 
58
  # Store this annotation in memory
59
  existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
60
  if existing_idx is not None:
61
  results["annotations"][existing_idx] = {
62
  "sample_id": sample_id,
63
- "rankings": rankings
64
  }
65
  else:
66
  results["annotations"].append({
67
  "sample_id": sample_id,
68
- "rankings": rankings
69
  })
70
 
71
  completed_samples[sample_id] = True
@@ -76,12 +48,10 @@ def create_reranking_interface(task_data):
76
  with open(output_path, "w") as f:
77
  json.dump(results, f, indent=2)
78
  return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
79
- except Exception as write_error:
80
- print(f"Error writing results file: {str(write_error)}")
81
  # If file saving fails, still mark as success since we saved in memory
82
- return f"✅ Rankings saved in memory (file save failed: {str(write_error)})", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
83
  except Exception as e:
84
- print(f"Error in save_ranking: {str(e)}")
85
  # Return specific error message
86
  return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
87
 
@@ -96,11 +66,12 @@ def create_reranking_interface(task_data):
96
 
97
  ### How to use this interface:
98
  1. Read the query at the top
99
- 2. Review each document in the list
100
- 3. Use the "Move Up" and "Move Down" buttons to arrange documents by relevance
101
- (most relevant at the top, least relevant at the bottom)
102
- 4. Click "Submit Rankings" when you're done with the current query
103
- 5. Use "Previous" and "Next" to navigate between queries
 
104
  """.format(instructions=task_data["instructions"]))
105
 
106
  current_sample_id = gr.State(value=samples[0]["id"])
@@ -113,236 +84,711 @@ def create_reranking_interface(task_data):
113
  gr.Markdown("## Query:")
114
  query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
115
 
116
- gr.Markdown("## Documents (Arrange in order of relevance, most relevant at top):")
117
 
118
- # Create simple document list with move up/down buttons
119
- document_containers = []
 
120
 
121
- # Function to initialize the document list for a sample
122
- def initialize_document_list(sample_id):
123
- nonlocal current_order
124
- try:
125
- sample = next((s for s in samples if s["id"] == sample_id), None)
126
- if not sample:
127
- return "Query not found", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
128
-
129
- # Get the documents for this sample
130
- docs = sample["candidates"]
131
-
132
- # Initialize document order (0, 1, 2, ..., n-1)
133
- current_order = list(range(len(docs)))
134
-
135
- # Check if this sample has already been annotated to restore ordering
136
- existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
137
- if existing_annotation and "rankings" in existing_annotation:
138
- # Create pairs of (doc_idx, rank)
139
- ranked_docs = []
140
- for doc_idx, rank in enumerate(existing_annotation["rankings"]):
141
- ranked_docs.append((doc_idx, rank))
142
-
143
- # Sort by rank (ascending)
144
- ranked_docs.sort(key=lambda x: x[1])
 
 
 
 
 
 
145
 
146
- # Extract document indices in rank order
147
- current_order = [doc[0] for doc in ranked_docs]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # Update UI
150
- for i in range(len(document_containers)):
151
- if i < len(docs):
152
- doc_idx = current_order[i]
153
- document_containers[i].value = f"Document {doc_idx+1} (Rank {i+1}): {docs[doc_idx]}"
154
- else:
155
- document_containers[i].value = "" # Clear unused containers
 
156
 
157
- # Status message
158
- status = f"Viewing query {samples.index(sample) + 1} of {len(samples)}"
159
- if completed_samples[sample_id]:
160
- status += " (already completed)"
 
 
161
 
162
- return status, f"Progress: {sum(completed_samples.values())}/{len(samples)}"
163
- except Exception as e:
164
- print(f"Error in initialize_document_list: {str(e)}")
165
- return f"Error initializing documents: {str(e)}", "Progress: 0/0"
166
-
167
- # Create document display containers with up/down buttons
168
- with gr.Column():
169
- # Display up to 10 documents (or however many are in the largest sample)
170
- max_docs = max(len(s["candidates"]) for s in samples)
171
- for i in range(max_docs):
172
- with gr.Group():
173
- doc_text = gr.Textbox(label=f"Document {i+1}", interactive=False)
174
- document_containers.append(doc_text)
175
-
176
- with gr.Row():
177
- up_btn = gr.Button(f"⬆️ Move Up", size="sm")
178
- down_btn = gr.Button(f"⬇️ Move Down", size="sm")
179
 
180
- # Create closures for up/down buttons
181
- def make_up_handler(idx):
182
- def up_handler():
183
- nonlocal current_order
184
- if idx > 0:
185
- # Swap with document above
186
- current_order[idx], current_order[idx-1] = current_order[idx-1], current_order[idx]
187
-
188
- # Get current sample
189
- sample = next((s for s in samples if s["id"] == current_sample_id.value), None)
190
- if sample:
191
- docs = sample["candidates"]
192
- # Update document displays
193
- for j in range(len(document_containers)):
194
- if j < len(docs):
195
- doc_idx = current_order[j]
196
- document_containers[j].value = f"Document {doc_idx+1} (Rank {j+1}): {docs[doc_idx]}"
197
-
198
- # Return empty list since we update the containers directly
199
- return {}
200
- return up_handler
201
 
202
- def make_down_handler(idx):
203
- def down_handler():
204
- nonlocal current_order
205
- if idx < len(current_order) - 1:
206
- # Swap with document below
207
- current_order[idx], current_order[idx+1] = current_order[idx+1], current_order[idx]
208
-
209
- # Get current sample
210
- sample = next((s for s in samples if s["id"] == current_sample_id.value), None)
211
- if sample:
212
- docs = sample["candidates"]
213
- # Update document displays
214
- for j in range(len(document_containers)):
215
- if j < len(docs):
216
- doc_idx = current_order[j]
217
- document_containers[j].value = f"Document {doc_idx+1} (Rank {j+1}): {docs[doc_idx]}"
218
-
219
- # Return empty list since we update the containers directly
220
- return {}
221
- return down_handler
222
 
223
- # Connect buttons
224
- up_btn.click(
225
- fn=make_up_handler(i),
226
- inputs=None,
227
- outputs=None
228
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
- down_btn.click(
231
- fn=make_down_handler(i),
232
- inputs=None,
233
- outputs=None
234
- )
235
-
236
- with gr.Row():
237
- prev_btn = gr.Button("← Previous Query", size="sm")
238
- submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
239
- next_btn = gr.Button("Next Query →", size="sm")
240
-
241
- # Navigation functions
242
- def nav_to_prev(current_id):
243
- try:
244
- current_sample = next((s for s in samples if s["id"] == current_id), None)
245
- if not current_sample:
246
- return current_id
247
 
248
- current_idx = samples.index(current_sample)
249
- if current_idx > 0:
250
- prev_sample = samples[current_idx - 1]
251
- return prev_sample["id"]
252
- return current_id
253
- except Exception as e:
254
- print(f"Error in nav_to_prev: {str(e)}")
255
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- def nav_to_next(current_id):
258
- try:
259
- current_sample = next((s for s in samples if s["id"] == current_id), None)
260
- if not current_sample:
261
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- current_idx = samples.index(current_sample)
264
- if current_idx < len(samples) - 1:
265
- next_sample = samples[current_idx + 1]
266
- return next_sample["id"]
267
- return current_id
268
- except Exception as e:
269
- print(f"Error in nav_to_next: {str(e)}")
270
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- # Update query text
273
- def update_query(sample_id):
274
- try:
275
- sample = next((s for s in samples if s["id"] == sample_id), None)
276
- if not sample:
277
- return ""
278
- return sample["query"]
279
- except Exception as e:
280
- print(f"Error in update_query: {str(e)}")
281
- return "Error loading query"
282
-
283
- # Initialize the first sample
284
- initialize_document_list(samples[0]["id"])
285
-
286
- # Connect navigation buttons
287
- prev_btn.click(
288
- nav_to_prev,
289
- inputs=[current_sample_id],
290
- outputs=[current_sample_id]
291
- ).then(
292
- initialize_document_list,
293
- inputs=[current_sample_id],
294
- outputs=[status_box, progress_text]
295
- ).then(
296
- update_query,
297
- inputs=[current_sample_id],
298
- outputs=[query_text]
299
- )
300
 
301
- next_btn.click(
302
- nav_to_next,
303
- inputs=[current_sample_id],
304
- outputs=[current_sample_id]
305
- ).then(
306
- initialize_document_list,
307
- inputs=[current_sample_id],
308
- outputs=[status_box, progress_text]
309
- ).then(
310
- update_query,
311
- inputs=[current_sample_id],
312
- outputs=[query_text]
313
  )
314
 
315
- # Connect submit button
316
- submit_btn.click(
317
- save_ranking,
318
- inputs=[current_sample_id],
319
- outputs=[status_box, progress_text]
 
 
 
 
 
 
 
320
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  return demo
323
 
324
- # Simple Gradio app with hardcoded sample data
325
- with gr.Blocks(theme=gr.themes.Soft()) as app:
326
  gr.Markdown("# MTEB Human Evaluation Demo")
327
- gr.Markdown("## Sample Reranking Task")
328
 
329
- # Create and display the demo interface
330
- demo = create_reranking_interface(SAMPLE_TASK)
331
-
332
- if __name__ == "__main__":
333
- try:
334
- print("Starting MTEB Human Evaluation App with hardcoded sample data...")
335
- print(f"Current directory: {os.getcwd()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
- # Launch with hardcoded sample data
338
- app.launch(debug=True)
339
- except Exception as e:
340
- print(f"ERROR STARTING APP: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
- # Create an ultra-simple fallback app
343
- with gr.Blocks() as fallback_app:
344
- gr.Markdown("# MTEB Human Evaluation - Emergency Fallback Mode")
345
- gr.Markdown("There was an error loading the application. Please see details below.")
346
- gr.Textbox(value=str(e), label="Error", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
- fallback_app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def create_reranking_interface(task_data):
7
  """Create a Gradio interface for reranking evaluation."""
8
  samples = task_data["samples"]
9
  results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
10
  completed_samples = {s["id"]: False for s in samples}
11
 
12
+ def save_ranking(rankings, sample_id):
13
+ """Save the current set of rankings."""
 
 
 
14
  try:
15
+ # Check if all documents have rankings
16
+ all_ranked = all(r is not None and r != "" for r in rankings)
17
+ if not all_ranked:
18
+ return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
19
 
20
+ # Convert rankings to integers with better error handling
21
+ try:
22
+ processed_rankings = [int(r) for r in rankings]
23
+ except ValueError:
24
+ return "⚠️ Invalid ranking value. Please use only numbers.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
25
 
26
+ # Check for duplicate rankings
27
+ if len(set(processed_rankings)) != len(processed_rankings):
28
+ return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
29
+
30
  # Store this annotation in memory
31
  existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
32
  if existing_idx is not None:
33
  results["annotations"][existing_idx] = {
34
  "sample_id": sample_id,
35
+ "rankings": processed_rankings
36
  }
37
  else:
38
  results["annotations"].append({
39
  "sample_id": sample_id,
40
+ "rankings": processed_rankings
41
  })
42
 
43
  completed_samples[sample_id] = True
 
48
  with open(output_path, "w") as f:
49
  json.dump(results, f, indent=2)
50
  return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
51
+ except:
 
52
  # If file saving fails, still mark as success since we saved in memory
53
+ return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
54
  except Exception as e:
 
55
  # Return specific error message
56
  return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
57
 
 
66
 
67
  ### How to use this interface:
68
  1. Read the query at the top
69
+ 2. Review each document carefully
70
+ 3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
71
+ 4. Each document must have a unique rank
72
+ 5. Click "Submit Rankings" when you're done with the current query
73
+ 6. Use "Previous" and "Next" to navigate between queries
74
+ 7. Click "Save All Results" periodically to ensure your work is saved
75
  """.format(instructions=task_data["instructions"]))
76
 
77
  current_sample_id = gr.State(value=samples[0]["id"])
 
84
  gr.Markdown("## Query:")
85
  query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
86
 
87
+ gr.Markdown("## Documents to Rank:")
88
 
89
+ # Create document displays and ranking dropdowns in synchronized pairs
90
+ doc_containers = []
91
+ ranking_dropdowns = []
92
 
93
+ with gr.Column():
94
+ for i, doc in enumerate(samples[0]["candidates"]):
95
+ with gr.Row():
96
+ doc_box = gr.Textbox(
97
+ value=doc,
98
+ label=f"Document {i+1}",
99
+ interactive=False,
100
+ elem_classes="doc-box"
101
+ )
102
+ dropdown = gr.Dropdown(
103
+ choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
104
+ label=f"Rank",
105
+ value="",
106
+ elem_classes="ranking-dropdown"
107
+ )
108
+ # Add Quick Rank buttons for fast selection
109
+ with gr.Column(scale=1, min_width=120):
110
+ gr.Markdown(f"Quick Rank", elem_classes="quick-rank-label")
111
+ with gr.Row():
112
+ # Add first 5 rank buttons (or fewer if there are fewer candidates)
113
+ num_buttons = min(5, len(samples[0]["candidates"]))
114
+ for r in range(1, num_buttons + 1):
115
+ button = gr.Button(f"{r}", size="sm", elem_classes=f"quick-rank-btn quick-rank-btn-{i}-{r}")
116
+ # Use JavaScript to set the dropdown value when clicked
117
+ button.click(
118
+ None,
119
+ [],
120
+ [],
121
+ _js=f"() => {{ document.querySelectorAll('.ranking-dropdown')[{i}].value = '{r}'; return []; }}"
122
+ )
123
 
124
+ doc_containers.append(doc_box)
125
+ ranking_dropdowns.append(dropdown)
126
+
127
+ # Add keyboard shortcuts explanation
128
+ with gr.Accordion("Keyboard Shortcuts", open=False):
129
+ gr.Markdown("""
130
+ ### Keyboard Shortcuts
131
+ - When a document text box is focused:
132
+ - Press number keys (1-9) to assign rankings quickly
133
+ - Navigation:
134
+ - Press 'n' to go to the next query
135
+ - Press 'p' to go to the previous query
136
+ - Press 's' to submit the current rankings
137
+ """)
138
+
139
+ # Add JavaScript for keyboard shortcuts
140
+ gr.HTML("""
141
+ <script>
142
+ document.addEventListener('DOMContentLoaded', function() {
143
+ // Wait for Gradio elements to be fully loaded
144
+ setTimeout(() => {
145
+ // Get all document textboxes
146
+ const docBoxes = document.querySelectorAll('.doc-box');
147
+ const dropdowns = document.querySelectorAll('.ranking-dropdown');
148
 
149
+ // Add event listeners to document boxes
150
+ docBoxes.forEach((box, index) => {
151
+ box.addEventListener('click', function() {
152
+ // Mark this box as active for keyboard shortcuts
153
+ docBoxes.forEach(b => b.classList.remove('active-doc'));
154
+ box.classList.add('active-doc');
155
+ });
156
+ });
157
 
158
+ // Add event listeners to dropdowns for color coding
159
+ dropdowns.forEach((dropdown, index) => {
160
+ dropdown.addEventListener('change', function() {
161
+ updateDropdownColor(dropdown, docBoxes[index]);
162
+ });
163
+ });
164
 
165
+ // Function to update color based on rank
166
+ function updateDropdownColor(dropdown, docBox) {
167
+ const value = dropdown.value;
168
+ if (!value) return;
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ // Remove existing color classes
171
+ dropdown.classList.remove('rank-1', 'rank-2', 'rank-3', 'rank-4', 'rank-5', 'rank-high');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ // Add appropriate color class
174
+ if (value === '1') dropdown.classList.add('rank-1');
175
+ else if (value === '2') dropdown.classList.add('rank-2');
176
+ else if (value === '3') dropdown.classList.add('rank-3');
177
+ else if (value === '4') dropdown.classList.add('rank-4');
178
+ else if (value === '5') dropdown.classList.add('rank-5');
179
+ else dropdown.classList.add('rank-high');
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ // Add highlighting to document box
182
+ docBox.classList.add('ranked-doc');
183
+ }
184
+
185
+ // Add global keyboard listener
186
+ document.addEventListener('keydown', function(e) {
187
+ // Number keys 1-9 for ranking
188
+ if (e.key >= '1' && e.key <= '9') {
189
+ const activeDoc = document.querySelector('.active-doc');
190
+ if (activeDoc) {
191
+ const index = Array.from(docBoxes).indexOf(activeDoc);
192
+ const dropdown = document.querySelectorAll('.ranking-dropdown')[index];
193
+ if (dropdown) {
194
+ dropdown.value = e.key;
195
+ dropdown.dispatchEvent(new Event('change'));
196
+ updateDropdownColor(dropdown, activeDoc);
197
+ }
198
+ }
199
+ }
200
 
201
+ // Navigation shortcuts
202
+ if (e.key === 'n') {
203
+ // Next query
204
+ document.querySelector('#next-btn').click();
205
+ } else if (e.key === 'p') {
206
+ // Previous query
207
+ document.querySelector('#prev-btn').click();
208
+ } else if (e.key === 's') {
209
+ // Submit rankings
210
+ document.querySelector('#submit-btn').click();
211
+ }
212
+ });
 
 
 
 
 
213
 
214
+ // Add some CSS for active document
215
+ const style = document.createElement('style');
216
+ style.textContent = `
217
+ .active-doc {
218
+ border-left: 3px solid #3B82F6 !important;
219
+ background-color: rgba(59, 130, 246, 0.05) !important;
220
+ }
221
+ .ranked-doc {
222
+ border-bottom: 2px solid #4ADE80 !important;
223
+ }
224
+ .rank-1 {
225
+ background-color: rgba(74, 222, 128, 0.2) !important;
226
+ font-weight: bold !important;
227
+ }
228
+ .rank-2 {
229
+ background-color: rgba(74, 222, 128, 0.15) !important;
230
+ }
231
+ .rank-3 {
232
+ background-color: rgba(251, 191, 36, 0.15) !important;
233
+ }
234
+ .rank-4 {
235
+ background-color: rgba(251, 191, 36, 0.1) !important;
236
+ }
237
+ .rank-5 {
238
+ background-color: rgba(239, 68, 68, 0.1) !important;
239
+ }
240
+ .rank-high {
241
+ background-color: rgba(239, 68, 68, 0.05) !important;
242
+ }
243
+ .quick-rank-label {
244
+ margin-bottom: 0 !important;
245
+ font-size: 0.8rem !important;
246
+ opacity: 0.8;
247
+ }
248
+ .quick-rank-btn {
249
+ min-width: 20px !important;
250
+ height: 24px !important;
251
+ line-height: 1 !important;
252
+ padding: 2px 6px !important;
253
+ }
254
+ `;
255
+ document.head.appendChild(style);
256
+ }, 1000);
257
+ });
258
+ </script>
259
+ """)
260
 
261
+ # Add visual ranking mode option
262
+ with gr.Row():
263
+ visual_mode_btn = gr.Button("Toggle Visual Ranking Mode", size="sm")
264
+ reset_rankings_btn = gr.Button("Reset Rankings", size="sm", variant="secondary")
265
+
266
+ # Visual ranking display
267
+ with gr.Column(visible=False) as visual_ranking_container:
268
+ gr.Markdown("## Current Rankings (Most to Least Relevant)")
269
+ ranked_display = gr.HTML("No rankings yet")
270
+
271
+ # Function to toggle visual ranking mode
272
+ def toggle_visual_mode(visible):
273
+ return not visible
274
+
275
+ # Function to update visual ranking display
276
+ def update_visual_ranking(*rankings):
277
+ # Convert to integers with error handling
278
+ clean_rankings = []
279
+ for r in rankings:
280
+ try:
281
+ if r and r.strip():
282
+ clean_rankings.append(int(r))
283
+ else:
284
+ clean_rankings.append(None)
285
+ except ValueError:
286
+ clean_rankings.append(None)
287
+
288
+ # Check if any rankings exist
289
+ if not any(r is not None for r in clean_rankings):
290
+ return "<p>No rankings assigned yet.</p>"
291
+
292
+ # Create sorted order
293
+ ranked_indices = []
294
+ for rank in range(1, len(clean_rankings) + 1):
295
+ try:
296
+ idx = clean_rankings.index(rank)
297
+ ranked_indices.append(idx)
298
+ except ValueError:
299
+ pass
300
+
301
+ # Build HTML
302
+ html = "<div class='visual-ranking'>"
303
+ for i, idx in enumerate(ranked_indices):
304
+ rank = i + 1
305
+ doc_text = doc_containers[idx].value
306
 
307
+ # Apply color classes based on rank
308
+ rank_class = ""
309
+ if rank == 1:
310
+ rank_class = "visual-rank-1"
311
+ elif rank == 2:
312
+ rank_class = "visual-rank-2"
313
+ elif rank == 3:
314
+ rank_class = "visual-rank-3"
315
+ elif rank <= 5:
316
+ rank_class = "visual-rank-45"
317
+ else:
318
+ rank_class = "visual-rank-high"
319
+
320
+ html += f"""
321
+ <div class='visual-rank-item {rank_class}'>
322
+ <div class='visual-rank-number'>{rank}</div>
323
+ <div class='visual-rank-content'>{doc_text}</div>
324
+ </div>
325
+ """
326
+
327
+ # Add unranked items if any
328
+ unranked_indices = [i for i, r in enumerate(clean_rankings) if r is None]
329
+ if unranked_indices:
330
+ html += "<h3>Unranked Documents</h3>"
331
+ for idx in unranked_indices:
332
+ doc_text = doc_containers[idx].value
333
+ html += f"""
334
+ <div class='visual-rank-item visual-rank-unranked'>
335
+ <div class='visual-rank-number'>?</div>
336
+ <div class='visual-rank-content'>{doc_text}</div>
337
+ </div>
338
+ """
339
+
340
+ html += "</div>"
341
+
342
+ # Add CSS
343
+ html += """
344
+ <style>
345
+ .visual-ranking {
346
+ margin-top: 15px;
347
+ }
348
+ .visual-rank-item {
349
+ display: flex;
350
+ margin-bottom: 15px;
351
+ padding: 10px;
352
+ border-radius: 8px;
353
+ }
354
+ .visual-rank-number {
355
+ font-size: 18px;
356
+ font-weight: bold;
357
+ margin-right: 10px;
358
+ min-width: 30px;
359
+ height: 30px;
360
+ border-radius: 15px;
361
+ background-color: #e5e7eb;
362
+ display: flex;
363
+ align-items: center;
364
+ justify-content: center;
365
+ }
366
+ .visual-rank-content {
367
+ flex: 1;
368
+ }
369
+ .visual-rank-1 {
370
+ background-color: rgba(74, 222, 128, 0.2);
371
+ border-left: 4px solid #4ADE80;
372
+ }
373
+ .visual-rank-2 {
374
+ background-color: rgba(74, 222, 128, 0.15);
375
+ border-left: 3px solid #4ADE80;
376
+ }
377
+ .visual-rank-3 {
378
+ background-color: rgba(251, 191, 36, 0.15);
379
+ border-left: 3px solid #FBBF24;
380
+ }
381
+ .visual-rank-45 {
382
+ background-color: rgba(251, 191, 36, 0.1);
383
+ border-left: 2px solid #FBBF24;
384
+ }
385
+ .visual-rank-high {
386
+ background-color: rgba(239, 68, 68, 0.05);
387
+ border-left: 2px solid #EF4444;
388
+ }
389
+ .visual-rank-unranked {
390
+ background-color: #f9fafb;
391
+ border: 1px dashed #d1d5db;
392
+ }
393
+ .visual-rank-unranked .visual-rank-number {
394
+ background-color: #d1d5db;
395
+ }
396
+ </style>
397
+ """
398
+
399
+ return html
400
 
401
+ # Function to reset all rankings
402
+ def reset_rankings():
403
+ return ["" for _ in ranking_dropdowns]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
+ # Connect events
406
+ visual_mode_btn.click(
407
+ toggle_visual_mode,
408
+ inputs=[visual_ranking_container],
409
+ outputs=[visual_ranking_container]
 
 
 
 
 
 
 
410
  )
411
 
412
+ # Update visual ranking when any dropdown changes
413
+ for dropdown in ranking_dropdowns:
414
+ dropdown.change(
415
+ update_visual_ranking,
416
+ inputs=ranking_dropdowns,
417
+ outputs=[ranked_display]
418
+ )
419
+
420
+ # Reset rankings button
421
+ reset_rankings_btn.click(
422
+ reset_rankings,
423
+ outputs=ranking_dropdowns
424
  )
425
+
426
+ with gr.Row():
427
+ prev_btn = gr.Button("← Previous Query", size="sm", elem_id="prev-btn")
428
+ submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary", elem_id="submit-btn")
429
+ next_btn = gr.Button("Next Query →", size="sm", elem_id="next-btn")
430
+
431
+ save_btn = gr.Button("💾 Save All Results", variant="secondary")
432
+
433
+ def load_sample(sample_id):
434
+ """Load a specific sample into the interface."""
435
+ sample = next((s for s in samples if s["id"] == sample_id), None)
436
+ if not sample:
437
+ return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
438
+
439
+ # Update query
440
+ new_query = sample["query"]
441
+
442
+ # Update documents
443
+ new_docs = []
444
+ for i, doc in enumerate(sample["candidates"]):
445
+ if i < len(doc_containers):
446
+ new_docs.append(doc)
447
+
448
+ # Initialize rankings
449
+ new_rankings = [""] * len(ranking_dropdowns)
450
+
451
+ # Check if this sample has already been annotated
452
+ existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
453
+ if existing_annotation:
454
+ # Restore previous rankings
455
+ for i, rank in enumerate(existing_annotation["rankings"]):
456
+ if i < len(new_rankings) and rank is not None:
457
+ new_rankings[i] = str(rank)
458
+
459
+ # Update progress
460
+ current_idx = samples.index(sample)
461
+ new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
462
+
463
+ new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
464
+ if completed_samples[sample_id]:
465
+ new_status += " (already completed)"
466
+
467
+ return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
468
+
469
+ def next_sample(current_id):
470
+ """Load the next sample."""
471
+ current_sample = next((s for s in samples if s["id"] == current_id), None)
472
+ if not current_sample:
473
+ return current_id
474
+
475
+ current_idx = samples.index(current_sample)
476
+ if current_idx < len(samples) - 1:
477
+ next_sample = samples[current_idx + 1]
478
+ return next_sample["id"]
479
+ return current_id
480
+
481
+ def prev_sample(current_id):
482
+ """Load the previous sample."""
483
+ current_sample = next((s for s in samples if s["id"] == current_id), None)
484
+ if not current_sample:
485
+ return current_id
486
+
487
+ current_idx = samples.index(current_sample)
488
+ if current_idx > 0:
489
+ prev_sample = samples[current_idx - 1]
490
+ return prev_sample["id"]
491
+ return current_id
492
+
493
+ def save_results():
494
+ """Save all collected results to a file."""
495
+ output_path = f"{task_data['task_name']}_human_results.json"
496
+ with open(output_path, "w") as f:
497
+ json.dump(results, f, indent=2)
498
+ return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
499
+
500
+ # Define a wrapper function that collects all the dropdown values into a list
501
+ def save_ranking_wrapper(*args):
502
+ # The last argument is the sample_id, all others are rankings
503
+ rankings = args[:-1]
504
+ sample_id = args[-1]
505
+ return save_ranking(rankings, sample_id)
506
+
507
+ # Connect events
508
+ submit_btn.click(
509
+ save_ranking_wrapper,
510
+ inputs=ranking_dropdowns + [current_sample_id],
511
+ outputs=[status_box, progress_text]
512
+ ).then(
513
+ update_visual_ranking,
514
+ inputs=ranking_dropdowns,
515
+ outputs=[ranked_display]
516
+ )
517
+
518
+ next_btn.click(
519
+ next_sample,
520
+ inputs=[current_sample_id],
521
+ outputs=[current_sample_id]
522
+ ).then(
523
+ load_sample,
524
+ inputs=[current_sample_id],
525
+ outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
526
+ ).then(
527
+ update_visual_ranking,
528
+ inputs=ranking_dropdowns,
529
+ outputs=[ranked_display]
530
+ )
531
+
532
+ prev_btn.click(
533
+ prev_sample,
534
+ inputs=[current_sample_id],
535
+ outputs=[current_sample_id]
536
+ ).then(
537
+ load_sample,
538
+ inputs=[current_sample_id],
539
+ outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
540
+ ).then(
541
+ update_visual_ranking,
542
+ inputs=ranking_dropdowns,
543
+ outputs=[ranked_display]
544
+ )
545
+
546
+ save_btn.click(save_results, outputs=[status_box])
547
 
548
  return demo
549
 
550
# Main app with file upload capability.
# `demo` is the top-level Blocks app launched by the __main__ guard below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# MTEB Human Evaluation Demo")

    with gr.Tabs():
        # Tab 1: evaluate the most recently uploaded (or bundled) task.
        with gr.TabItem("Demo"):
            gr.Markdown("""
            ## MTEB Human Evaluation Interface

            This interface allows you to evaluate the relevance of documents for reranking tasks.
            """)
561
+
562
# Pick which task file the Demo tab should display.
def get_latest_task_file():
    """Return the path of the most recently modified uploaded task file,
    or the bundled example task when nothing has been uploaded yet."""
    os.makedirs("uploaded_tasks", exist_ok=True)
    candidates = [
        os.path.join("uploaded_tasks", name)
        for name in os.listdir("uploaded_tasks")
        if name.endswith(".json")
    ]
    if candidates:
        # Most recent upload wins.
        return max(candidates, key=os.path.getmtime)
    # Fall back to the default example shipped with the app.
    return "AskUbuntuDupQuestions_human_eval.json"
575
+
576
# Load the task file chosen above and build the evaluation interface.
task_file = get_latest_task_file()

try:
    with open(task_file, "r") as f:
        task_data = json.load(f)

    # Show which task is currently loaded
    gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")

    # Display the interface
    reranking_demo = create_reranking_interface(task_data)
except Exception as e:
    # Broad catch so a bad/missing task file degrades to an error banner
    # instead of crashing the whole app at startup.
    gr.Markdown(f"**Error loading task: {str(e)}**")
    gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
591
 
592
# Tab 2: upload a new task file and manage results.
with gr.TabItem("Upload & Evaluate"):
    gr.Markdown("""
    ## Upload Your Own Task File

    If you have a prepared task file, you can upload it here to create an evaluation interface.
    """)

    with gr.Row():
        # Left column: upload controls and file listings.
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload a task file (JSON)")
            load_btn = gr.Button("Load Task")
            message = gr.Textbox(label="Status", interactive=False)

            # Add task list for previously uploaded tasks
            gr.Markdown("### Previous Uploads")
607
+
608
# Function to list existing task files in the tasks directory
def list_task_files():
    """Return a markdown bullet list of uploaded task files.

    The previous version emitted `javascript:selectTask('...')` links, but
    no `selectTask` function is defined anywhere in this app and Gradio's
    markdown rendering does not execute javascript: URLs, so those links
    were dead.  Plain bullets are used instead.
    """
    # Ensure the directory exists so listdir can't fail on first run.
    os.makedirs("uploaded_tasks", exist_ok=True)
    tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
    if not tasks:
        return "No task files uploaded yet."
    return "\n".join(f"- {t}" for t in tasks)
615
+
616
# Markdown panel showing the uploaded-task list, plus a manual refresh.
task_list = gr.Markdown(list_task_files())
refresh_btn = gr.Button("Refresh List")

# Add results management section
gr.Markdown("### Results Management")
621
+
622
# Function to list existing result files
def list_result_files():
    """Return a markdown summary of every *_human_results.json file in the
    working directory, annotated with per-file stats when readable."""
    results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    if not results:
        return "No result files available yet."

    result_links = []
    for r in results:
        # Calculate completion stats
        try:
            with open(r, "r") as f:
                result_data = json.load(f)
            annotation_count = len(result_data.get("annotations", []))
            task_name = result_data.get("task_name", "Unknown")
            result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
        except (OSError, json.JSONDecodeError):
            # Unreadable or corrupt file: still list it, just without stats.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs.)
            result_links.append(f"- {r}")

    return "\n".join(result_links)
641
+
642
# Markdown panel showing saved result files, plus a download trigger.
results_list = gr.Markdown(list_result_files())
download_results_btn = gr.Button("Download Results")

# Right side - will contain the actual interface
with gr.Column(scale=2):
    task_container = gr.HTML()
648
+
649
# Handle file upload and storage
def handle_upload(file):
    """Validate an uploaded task JSON, persist it under uploaded_tasks/,
    and report back to the UI.

    Returns a (status message, task-list markdown, task-container HTML)
    triple matching the outputs wired to load_btn.click.  On error paths
    the current task_list/task_container values are returned unchanged.
    """
    if not file:
        return "Please upload a task file", task_list.value, task_container.value

    try:
        # Create directory if it doesn't exist
        os.makedirs("uploaded_tasks", exist_ok=True)

        # Read the uploaded file (Gradio supplies a temp-file path in .name)
        with open(file.name, "r") as f:
            task_data = json.load(f)

        # Validate task format
        if "task_name" not in task_data or "samples" not in task_data:
            return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value

        # Save to a consistent location so get_latest_task_file() finds it
        task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
        with open(task_filename, "w") as f:
            json.dump(task_data, f, indent=2)

        # Instead of trying to create the interface here,
        # we'll return a message with instructions
        return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
        <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
            <h3>Task uploaded successfully!</h3>
            <p>Task Name: {task_data['task_name']}</p>
            <p>Samples: {len(task_data['samples'])}</p>
            <p>To evaluate this task:</p>
            <ol>
                <li>Refresh the app</li>
                <li>The Demo tab will now use your uploaded task</li>
                <li>Complete your evaluations</li>
                <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
            </ol>
        </div>
        """
    except Exception as e:
        # Broad catch: any parse/IO failure becomes a status message.
        return f"Error processing task file: {str(e)}", task_list.value, task_container.value
689
+
690
# Bundle result files for the download button on the Upload tab.
def prepare_results_for_download():
    """Zip every *_human_results.json in the CWD into one archive.

    Returns the archive path, or None when there is nothing to bundle.
    """
    import zipfile

    result_names = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    if not result_names:
        return None

    zip_path = "mteb_human_eval_results.zip"
    with zipfile.ZipFile(zip_path, 'w') as archive:
        for name in result_names:
            archive.write(name)
    return zip_path
704
+
705
# Connect events
load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
refresh_btn.click(list_task_files, outputs=[task_list])
# NOTE(review): the gr.File here is constructed inside the event wiring
# rather than in the layout — presumably intended as a hidden download
# target; confirm it actually renders in this Gradio version.
download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
709
 
710
# Tab 3: inspect and download collected evaluation results.
with gr.TabItem("Results Management"):
    gr.Markdown("""
    ## Manage Evaluation Results

    View, download, and analyze your evaluation results.
    """)
716
+
717
# Function to load and display result stats
def get_result_stats():
    """Build a markdown summary (one section per results file) covering
    annotation counts and, where the task file is available, completion."""
    result_files = [name for name in os.listdir(".") if name.endswith("_human_results.json")]
    if not result_files:
        return "No result files available yet."

    sections = []
    for r in result_files:
        try:
            with open(r, "r") as fh:
                result_data = json.load(fh)

            task_name = result_data.get("task_name", "Unknown")
            annotations = result_data.get("annotations", [])
            annotation_count = len(annotations)

            # Distinct samples annotated so far.
            sample_ids = {a.get("sample_id") for a in annotations}

            # If the originating task file is present, report x/y completion;
            # otherwise just report how many distinct samples were annotated.
            total_samples = 0
            task_file = f"uploaded_tasks/{task_name}_task.json"
            if os.path.exists(task_file):
                with open(task_file, "r") as fh:
                    task_data = json.load(fh)
                    total_samples = len(task_data.get("samples", []))

            if total_samples:
                completion = f"{len(sample_ids)}/{total_samples}"
            else:
                completion = f"{len(sample_ids)} samples"

            sections.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
        except Exception as e:
            # Keep the broken file visible with the error inline.
            sections.append(f"### {r}\n- Error loading results: {str(e)}")

    return "\n\n".join(sections)
751
+
752
# Stats panel plus a manual refresh button.
result_stats = gr.Markdown(get_result_stats())
refresh_results_btn = gr.Button("Refresh Results")

# Add download options
with gr.Row():
    download_all_btn = gr.Button("Download All Results (ZIP)")
    # Dropdown is populated once at build time; refresh re-populates it below.
    result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
    download_selected_btn = gr.Button("Download Selected")

# Add results visualization placeholder
gr.Markdown("### Results Visualization")
gr.Markdown("*Visualization features will be added in a future update.*")

# Connect events
refresh_results_btn.click(get_result_stats, outputs=[result_stats])
767
+
768
# Function to prepare all results for download as ZIP
def prepare_all_results():
    """Zip every *_human_results.json in the CWD and return the archive path.

    Unlike prepare_results_for_download, this always produces an archive,
    even when it ends up empty.
    """
    import zipfile

    zip_path = "mteb_human_eval_results.zip"
    result_names = (f for f in os.listdir(".") if f.endswith("_human_results.json"))
    with zipfile.ZipFile(zip_path, 'w') as archive:
        for name in result_names:
            archive.write(name)
    return zip_path
776
+
777
# Function to return a single result file
def get_selected_result(filename):
    """Return *filename* when it names an existing file, otherwise None.

    Used as the handler for the per-file download button.
    """
    if filename and os.path.exists(filename):
        return filename
    return None
784
+
785
# Update dropdown when refreshing results
def update_result_dropdown():
    """Re-scan the CWD and refresh the result-file dropdown choices.

    Uses gr.update(...) instead of gr.Dropdown.update(...): the per-component
    update classmethod was removed in Gradio 4.x, while gr.update works on
    both 3.x and 4.x.
    """
    choices = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    return gr.update(choices=choices)
788
+
789
# Refresh also re-populates the download dropdown with current files.
refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])

# Script entry point: launch the Gradio app.
if __name__ == "__main__":
    demo.launch()