AvocadoMuffin committed (verified)
Commit 5ad87e4 · 1 Parent(s): 8663fbd

Update app.py

Files changed (1):
  1. app.py +119 -33
app.py CHANGED
@@ -53,6 +53,14 @@ def exact_match_score(prediction, ground_truth):
     """Calculate exact match score"""
     return normalize_answer(prediction) == normalize_answer(ground_truth)
 
+def max_over_ground_truths(metric_fn, prediction, ground_truths):
+    """Calculate maximum score over all ground truth answers"""
+    scores = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores.append(score)
+    return max(scores) if scores else 0
+
 def evaluate_model():
     # Authenticate with Hugging Face using the token
     hf_token = os.getenv("EVAL_TOKEN")
@@ -78,6 +86,21 @@ def evaluate_model():
         print(f"✗ Error loading model: {e}")
         return None, None
 
+def inspect_dataset_structure(dataset, num_samples=3):
+    """Inspect dataset structure for debugging"""
+    print(f"Dataset structure inspection:")
+    print(f"Dataset type: {type(dataset)}")
+    print(f"Dataset length: {len(dataset)}")
+
+    if len(dataset) > 0:
+        sample = dataset[0]
+        print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}")
+        print(f"Sample structure:")
+        for key, value in sample.items():
+            print(f"  {key}: {type(value)} - {str(value)[:100]}...")
+
+    return dataset
+
 def run_evaluation(num_samples, progress=gr.Progress()):
     """Run evaluation and return results for Gradio interface"""
 
@@ -88,20 +111,33 @@ def run_evaluation(num_samples, progress=gr.Progress()):
 
     progress(0.1, desc="Loading CUAD dataset...")
 
-    # Load dataset - use QA format version (JSON, no PDFs)
+    # Load dataset - try multiple approaches
+    dataset = None
+    test_data = None
+
     try:
-        # Try the QA-specific version first (much faster, JSON format)
-        dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True, token=hf_token)
+        # Try cuad dataset directly
+        print("Attempting to load CUAD dataset...")
+        dataset = load_dataset("cuad", token=hf_token)
         test_data = dataset["test"]
-        print(f"✓ Loaded CUAD-QA dataset with {len(test_data)} samples")
+        print(f"✓ Loaded CUAD dataset with {len(test_data)} samples")
+
+        # Inspect structure
+        test_data = inspect_dataset_structure(test_data)
+
     except Exception as e:
+        print(f"Error loading CUAD dataset: {e}")
         try:
-            # Fallback to original but limit to avoid PDF downloads
-            dataset = load_dataset("cuad", split="test[:1000]", trust_remote_code=True, token=hf_token)
-            test_data = dataset
-            print(f"✓ Loaded CUAD dataset with {len(test_data)} samples")
+            # Try squad format as fallback
+            print("Trying SQuAD format...")
+            dataset = load_dataset("squad", split="validation", token=hf_token)
+            test_data = dataset.select(range(min(1000, len(dataset))))
+            print(f"✓ Loaded SQuAD dataset as fallback with {len(test_data)} samples")
         except Exception as e2:
-            return f"❌ Error loading dataset: {e2}", pd.DataFrame(), None
+            return f"❌ Error loading any dataset: {e2}", pd.DataFrame(), None
+
+    if test_data is None:
+        return "❌ No test data available", pd.DataFrame(), None
 
     # Limit samples
     num_samples = min(num_samples, len(test_data))
@@ -119,23 +155,53 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         progress((0.2 + 0.7 * i / num_samples), desc=f"Processing sample {i+1}/{num_samples}")
 
         try:
-            context = example["context"]
-            question = example["question"]
-            answers = example["answers"]
+            # Handle different dataset formats
+            if "context" in example:
+                context = example["context"]
+            elif "text" in example:
+                context = example["text"]
+            else:
+                print(f"Warning: No context found in sample {i}")
+                continue
+
+            if "question" in example:
+                question = example["question"]
+            elif "title" in example:
+                question = example["title"]
+            else:
+                print(f"Warning: No question found in sample {i}")
+                continue
 
-            # Get model prediction
-            result = qa_pipeline(question=question, context=context)
-            predicted_answer = result["answer"]
+            # Handle answers field
+            ground_truths = []
+            if "answers" in example:
+                answers = example["answers"]
+                if isinstance(answers, dict):
+                    if "text" in answers:
+                        if isinstance(answers["text"], list):
+                            ground_truths = [ans for ans in answers["text"] if ans.strip()]
+                        else:
+                            ground_truths = [answers["text"]] if answers["text"].strip() else []
+                elif isinstance(answers, list):
+                    ground_truths = answers
 
-            # Get ground truth answers
-            if answers["text"] and len(answers["text"]) > 0:
-                ground_truth = answers["text"][0] if isinstance(answers["text"], list) else answers["text"]
-            else:
-                ground_truth = ""
+            # Skip if no ground truth
+            if not ground_truths:
+                print(f"Warning: No ground truth found for sample {i}")
+                continue
+
+            # Get model prediction
+            try:
+                result = qa_pipeline(question=question, context=context)
+                predicted_answer = result["answer"]
+                confidence = result["score"]
+            except Exception as e:
+                print(f"Error getting prediction for sample {i}: {e}")
+                continue
 
-            # Calculate metrics
-            em = exact_match_score(predicted_answer, ground_truth)
-            f1 = f1_score_qa(predicted_answer, ground_truth)
+            # Calculate metrics using max over ground truths
+            em = max_over_ground_truths(exact_match_score, predicted_answer, ground_truths)
+            f1 = max_over_ground_truths(f1_score_qa, predicted_answer, ground_truths)
 
             exact_matches.append(em)
             f1_scores.append(f1)
@@ -143,11 +209,12 @@ def run_evaluation(num_samples, progress=gr.Progress()):
             predictions.append({
                 "Sample_ID": i+1,
                 "Question": question[:100] + "..." if len(question) > 100 else question,
-                "Predicted_Answer": predicted_answer,
-                "Ground_Truth": ground_truth,
+                "Predicted_Answer": predicted_answer[:100] + "..." if len(predicted_answer) > 100 else predicted_answer,
+                "Ground_Truth": ground_truths[0][:100] + "..." if len(ground_truths[0]) > 100 else ground_truths[0],
+                "Num_Ground_Truths": len(ground_truths),
                 "Exact_Match": em,
                 "F1_Score": round(f1, 3),
-                "Confidence": round(result["score"], 3)
+                "Confidence": round(confidence, 3)
             })
 
         except Exception as e:
@@ -163,24 +230,36 @@ def run_evaluation(num_samples, progress=gr.Progress()):
     avg_exact_match = np.mean(exact_matches) * 100
     avg_f1_score = np.mean(f1_scores) * 100
 
+    # Calculate additional statistics
+    high_confidence_samples = [p for p in predictions if p['Confidence'] > 0.8]
+    perfect_matches = [p for p in predictions if p['Exact_Match'] == 1]
+    high_f1_samples = [p for p in predictions if p['F1_Score'] > 0.8]
+
     # Create results summary
     results_summary = f"""
 # 📊 CUAD Model Evaluation Results
-
 ## 🎯 Overall Performance
 - **Model**: AvocadoMuffin/roberta-cuad-qa-v3
 - **Dataset**: CUAD (Contract Understanding Atticus Dataset)
 - **Samples Evaluated**: {len(exact_matches)}
 - **Evaluation Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 
-## 📈 Metrics
+## 📈 Core Metrics
 - **Exact Match Score**: {avg_exact_match:.2f}%
 - **F1 Score**: {avg_f1_score:.2f}%
 
 ## 🔍 Performance Analysis
-- **High Confidence Predictions**: {len([p for p in predictions if p['Confidence'] > 0.8])} ({len([p for p in predictions if p['Confidence'] > 0.8])/len(predictions)*100:.1f}%)
-- **Perfect Matches**: {len([p for p in predictions if p['Exact_Match'] == 1])} ({len([p for p in predictions if p['Exact_Match'] == 1])/len(predictions)*100:.1f}%)
-- **High F1 Scores (>0.8)**: {len([p for p in predictions if p['F1_Score'] > 0.8])} ({len([p for p in predictions if p['F1_Score'] > 0.8])/len(predictions)*100:.1f}%)
+- **High Confidence Predictions (>0.8)**: {len(high_confidence_samples)} ({len(high_confidence_samples)/len(predictions)*100:.1f}%)
+- **Perfect Matches**: {len(perfect_matches)} ({len(perfect_matches)/len(predictions)*100:.1f}%)
+- **High F1 Scores (>0.8)**: {len(high_f1_samples)} ({len(high_f1_samples)/len(predictions)*100:.1f}%)
+
+## 📊 Distribution
+- **Average Confidence**: {np.mean([p['Confidence'] for p in predictions]):.3f}
+- **Median F1 Score**: {np.median([p['F1_Score'] for p in predictions]):.3f}
+- **Samples with Multiple Ground Truths**: {len([p for p in predictions if p['Num_Ground_Truths'] > 1])}
+
+## 🎯 Evaluation Quality
+The evaluation accounts for multiple ground truth answers where available, using the maximum score across all valid answers for each question.
     """
 
     # Create detailed results DataFrame
@@ -197,7 +276,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         "exact_match_score": avg_exact_match,
         "f1_score": avg_f1_score,
         "evaluation_date": datetime.now().isoformat(),
-        "predictions": predictions
+        "evaluation_methodology": "max_over_ground_truths",
+        "predictions": predictions,
+        "summary_stats": {
+            "avg_confidence": float(np.mean([p['Confidence'] for p in predictions])),
+            "median_f1": float(np.median([p['F1_Score'] for p in predictions])),
+            "samples_with_multiple_ground_truths": len([p for p in predictions if p['Num_Ground_Truths'] > 1])
+        }
     }
 
     try:
@@ -220,7 +305,7 @@ def create_gradio_interface():
     <div style="text-align: center; padding: 20px;">
         <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
         <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
-        <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
+        <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v3</p>
     </div>
     """)
 
@@ -250,6 +335,7 @@ def create_gradio_interface():
                 <li><strong>Exact Match</strong>: Percentage of perfect predictions</li>
                 <li><strong>F1 Score</strong>: Token-level overlap between prediction and ground truth</li>
                 <li><strong>Confidence</strong>: Model's confidence in its predictions</li>
+                <li><strong>Max-over-GT</strong>: Best score across multiple ground truth answers</li>
             </ul>
         </div>
         """)