Update app.py
app.py
CHANGED
@@ -53,6 +53,14 @@ def exact_match_score(prediction, ground_truth):
     """Calculate exact match score"""
     return normalize_answer(prediction) == normalize_answer(ground_truth)
 
+def max_over_ground_truths(metric_fn, prediction, ground_truths):
+    """Calculate maximum score over all ground truth answers"""
+    scores = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores.append(score)
+    return max(scores) if scores else 0
+
 def evaluate_model():
     # Authenticate with Hugging Face using the token
     hf_token = os.getenv("EVAL_TOKEN")
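The new `max_over_ground_truths` helper and the existing `exact_match_score` both lean on `normalize_answer` and `f1_score_qa`, which sit outside this hunk. For reference, a minimal sketch of what those helpers typically look like, following the standard SQuAD evaluation recipe (an assumption, not code taken from app.py):

```python
import re
import string
from collections import Counter

def normalize_answer(s):
    """Lowercase, drop punctuation and articles, collapse whitespace (SQuAD-style)."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth."""
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)
```

With helpers shaped like these, `max_over_ground_truths(f1_score_qa, prediction, ground_truths)` simply returns the best token overlap achieved against any of the annotated answers.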
@@ -78,6 +86,21 @@ def evaluate_model():
         print(f"❌ Error loading model: {e}")
         return None, None
 
+def inspect_dataset_structure(dataset, num_samples=3):
+    """Inspect dataset structure for debugging"""
+    print(f"Dataset structure inspection:")
+    print(f"Dataset type: {type(dataset)}")
+    print(f"Dataset length: {len(dataset)}")
+
+    if len(dataset) > 0:
+        sample = dataset[0]
+        print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}")
+        print(f"Sample structure:")
+        for key, value in sample.items():
+            print(f"  {key}: {type(value)} - {str(value)[:100]}...")
+
+    return dataset
+
 def run_evaluation(num_samples, progress=gr.Progress()):
     """Run evaluation and return results for Gradio interface"""
 
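The evaluation loop further down reads `result["answer"]` and `result["score"]`, which is the output shape of the Hugging Face question-answering pipeline; `evaluate_model()` (largely unchanged in this commit) presumably builds `qa_pipeline` that way. A minimal sketch under that assumption:

```python
from transformers import pipeline

# Assumed to mirror what evaluate_model() does; not copied from app.py.
model_name = "AvocadoMuffin/roberta-cuad-qa-v3"
qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)

result = qa_pipeline(
    question="What is the governing law of this agreement?",
    context="This Agreement shall be governed by the laws of the State of Delaware.",
)
print(result["answer"], result["score"])  # the same keys the evaluation loop reads
```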
@@ -88,20 +111,33 @@ def run_evaluation(num_samples, progress=gr.Progress()):
 
     progress(0.1, desc="Loading CUAD dataset...")
 
+    # Load dataset - try multiple approaches
+    dataset = None
+    test_data = None
+
     try:
+        # Try cuad dataset directly
+        print("Attempting to load CUAD dataset...")
+        dataset = load_dataset("cuad", token=hf_token)
         test_data = dataset["test"]
+        print(f"✅ Loaded CUAD dataset with {len(test_data)} samples")
+
+        # Inspect structure
+        test_data = inspect_dataset_structure(test_data)
+
     except Exception as e:
+        print(f"Error loading CUAD dataset: {e}")
         try:
+            # Try squad format as fallback
+            print("Trying SQuAD format...")
+            dataset = load_dataset("squad", split="validation", token=hf_token)
+            test_data = dataset.select(range(min(1000, len(dataset))))
+            print(f"✅ Loaded SQuAD dataset as fallback with {len(test_data)} samples")
         except Exception as e2:
+            return f"❌ Error loading any dataset: {e2}", pd.DataFrame(), None
+
+    if test_data is None:
+        return "❌ No test data available", pd.DataFrame(), None
 
     # Limit samples
     num_samples = min(num_samples, len(test_data))
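The branching in the next hunk on `context`/`text`, `question`/`title` and the `answers` dict exists because the primary dataset and the SQuAD fallback are similar but not guaranteed to expose identical fields. A quick way to confirm the record layout the loop expects (assuming the Hub copy of `cuad` ships a SQuAD-style test split):

```python
from datasets import load_dataset

cuad_test = load_dataset("cuad", split="test")
example = cuad_test[0]
print(list(example.keys()))   # typically: ['id', 'title', 'context', 'question', 'answers']
print(example["answers"])     # dict with parallel 'text' and 'answer_start' lists
```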
@@ -119,23 +155,53 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         progress((0.2 + 0.7 * i / num_samples), desc=f"Processing sample {i+1}/{num_samples}")
 
         try:
+            # Handle different dataset formats
+            if "context" in example:
+                context = example["context"]
+            elif "text" in example:
+                context = example["text"]
+            else:
+                print(f"Warning: No context found in sample {i}")
+                continue
+
+            if "question" in example:
+                question = example["question"]
+            elif "title" in example:
+                question = example["title"]
+            else:
+                print(f"Warning: No question found in sample {i}")
+                continue
 
+            # Handle answers field
+            ground_truths = []
+            if "answers" in example:
+                answers = example["answers"]
+                if isinstance(answers, dict):
+                    if "text" in answers:
+                        if isinstance(answers["text"], list):
+                            ground_truths = [ans for ans in answers["text"] if ans.strip()]
+                        else:
+                            ground_truths = [answers["text"]] if answers["text"].strip() else []
+                elif isinstance(answers, list):
+                    ground_truths = answers
 
+            # Skip if no ground truth
+            if not ground_truths:
+                print(f"Warning: No ground truth found for sample {i}")
+                continue
+
+            # Get model prediction
+            try:
+                result = qa_pipeline(question=question, context=context)
+                predicted_answer = result["answer"]
+                confidence = result["score"]
+            except Exception as e:
+                print(f"Error getting prediction for sample {i}: {e}")
+                continue
 
+            # Calculate metrics using max over ground truths
+            em = max_over_ground_truths(exact_match_score, predicted_answer, ground_truths)
+            f1 = max_over_ground_truths(f1_score_qa, predicted_answer, ground_truths)
 
             exact_matches.append(em)
             f1_scores.append(f1)
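A toy run of the scoring path above makes the max-over-ground-truths behaviour concrete; the exact numbers assume the SQuAD-style `normalize_answer`/`f1_score_qa` sketched earlier:

```python
prediction = "a period of thirty (30) days"
ground_truths = ["30 days", "thirty (30) days"]

em = max_over_ground_truths(exact_match_score, prediction, ground_truths)
f1 = max_over_ground_truths(f1_score_qa, prediction, ground_truths)

print(em)  # False - neither annotated answer matches exactly after normalization
print(f1)  # 0.75 - best overlap is with "thirty (30) days": 3 shared tokens, 5 predicted, 3 in the reference
```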
@@ -143,11 +209,12 @@ def run_evaluation(num_samples, progress=gr.Progress()):
             predictions.append({
                 "Sample_ID": i+1,
                 "Question": question[:100] + "..." if len(question) > 100 else question,
+                "Predicted_Answer": predicted_answer[:100] + "..." if len(predicted_answer) > 100 else predicted_answer,
+                "Ground_Truth": ground_truths[0][:100] + "..." if len(ground_truths[0]) > 100 else ground_truths[0],
+                "Num_Ground_Truths": len(ground_truths),
                 "Exact_Match": em,
                 "F1_Score": round(f1, 3),
+                "Confidence": round(confidence, 3)
             })
 
         except Exception as e:
@@ -163,24 +230,36 @@ def run_evaluation(num_samples, progress=gr.Progress()):
     avg_exact_match = np.mean(exact_matches) * 100
     avg_f1_score = np.mean(f1_scores) * 100
 
+    # Calculate additional statistics
+    high_confidence_samples = [p for p in predictions if p['Confidence'] > 0.8]
+    perfect_matches = [p for p in predictions if p['Exact_Match'] == 1]
+    high_f1_samples = [p for p in predictions if p['F1_Score'] > 0.8]
+
     # Create results summary
     results_summary = f"""
 # 📊 CUAD Model Evaluation Results
 ## 🎯 Overall Performance
 - **Model**: AvocadoMuffin/roberta-cuad-qa-v3
 - **Dataset**: CUAD (Contract Understanding Atticus Dataset)
 - **Samples Evaluated**: {len(exact_matches)}
 - **Evaluation Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 
+## 📊 Core Metrics
 - **Exact Match Score**: {avg_exact_match:.2f}%
 - **F1 Score**: {avg_f1_score:.2f}%
 
 ## 📈 Performance Analysis
+- **High Confidence Predictions (>0.8)**: {len(high_confidence_samples)} ({len(high_confidence_samples)/len(predictions)*100:.1f}%)
+- **Perfect Matches**: {len(perfect_matches)} ({len(perfect_matches)/len(predictions)*100:.1f}%)
+- **High F1 Scores (>0.8)**: {len(high_f1_samples)} ({len(high_f1_samples)/len(predictions)*100:.1f}%)
+
+## 📊 Distribution
+- **Average Confidence**: {np.mean([p['Confidence'] for p in predictions]):.3f}
+- **Median F1 Score**: {np.median([p['F1_Score'] for p in predictions]):.3f}
+- **Samples with Multiple Ground Truths**: {len([p for p in predictions if p['Num_Ground_Truths'] > 1])}
+
+## 🎯 Evaluation Quality
+The evaluation accounts for multiple ground truth answers where available, using the maximum score across all valid answers for each question.
 """
 
     # Create detailed results DataFrame
@@ -197,7 +276,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         "exact_match_score": avg_exact_match,
         "f1_score": avg_f1_score,
         "evaluation_date": datetime.now().isoformat(),
+        "evaluation_methodology": "max_over_ground_truths",
+        "predictions": predictions,
+        "summary_stats": {
+            "avg_confidence": float(np.mean([p['Confidence'] for p in predictions])),
+            "median_f1": float(np.median([p['F1_Score'] for p in predictions])),
+            "samples_with_multiple_ground_truths": len([p for p in predictions if p['Num_Ground_Truths'] > 1])
+        }
     }
 
     try:
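Most values in this dict are native Python types, and the nested summary stats are explicitly cast to `float`; only the top-level averages from `np.mean()` remain numpy scalars. The app's real save/upload path sits in the unchanged `try:` block that follows. A minimal local-save sketch, using `results_data` as a placeholder for whatever the dict is actually named in app.py:

```python
import json
from datetime import datetime

out_path = f"cuad_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(out_path, "w") as f:
    # default=float coerces any remaining numpy scalars (e.g. the np.mean() averages)
    json.dump(results_data, f, indent=2, default=float)
```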
@@ -220,7 +305,7 @@ def create_gradio_interface():
         <div style="text-align: center; padding: 20px;">
             <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
             <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
+            <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v3</p>
         </div>
     """)
 
@@ -250,6 +335,7 @@ def create_gradio_interface():
             <li><strong>Exact Match</strong>: Percentage of perfect predictions</li>
             <li><strong>F1 Score</strong>: Token-level overlap between prediction and ground truth</li>
             <li><strong>Confidence</strong>: Model's confidence in its predictions</li>
+            <li><strong>Max-over-GT</strong>: Best score across multiple ground truth answers</li>
         </ul>
     </div>
     """)