AvocadoMuffin committed on
Commit a1ce4b0 · verified · 1 Parent(s): 0f03dd5

Update app.py

Files changed (1)
  1. app.py +22 -10
app.py CHANGED
@@ -84,7 +84,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
     # Load model
     qa_pipeline, hf_token = evaluate_model()
     if qa_pipeline is None:
-        return "❌ Failed to load model", "", ""
+        return "❌ Failed to load model", pd.DataFrame(), None

     progress(0.1, desc="Loading CUAD dataset...")

@@ -97,7 +97,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
         test_data = dataset["test"]
     except Exception as e2:
-        return f"❌ Error loading dataset: {e2}", "", ""
+        return f"❌ Error loading dataset: {e2}", pd.DataFrame(), None

     # Limit samples
     num_samples = min(num_samples, len(test_data))
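
Note: both early returns used to send back empty strings for all three outputs. Since the click handler's outputs are a summary component, a gr.Dataframe, and a gr.File (see the interface code further down), each failure path now returns a value of the matching type. A minimal, self-contained sketch of that contract, with illustrative component names:

    import gradio as gr
    import pandas as pd

    def run(n):
        n = int(n)
        if n <= 0:
            # One value per wired output: str for the summary,
            # an (empty) DataFrame for the table, None for the file.
            return "❌ Nothing to evaluate", pd.DataFrame(), None
        return f"Evaluated {n} samples", pd.DataFrame({"sample": range(n)}), None

    with gr.Blocks() as demo:
        num = gr.Number(value=5, label="Samples")
        btn = gr.Button("Run")
        summary = gr.Markdown()
        table = gr.Dataframe()
        file_out = gr.File(visible=False)
        btn.click(fn=run, inputs=[num], outputs=[summary, table, file_out])

Returning pd.DataFrame() and None on failure spares Gradio from coercing an empty string into a table or a file path.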
@@ -147,11 +147,15 @@ def run_evaluation(num_samples, progress=gr.Progress()):
             })

         except Exception as e:
+            print(f"Error processing sample {i}: {e}")
             continue

     progress(0.9, desc="Calculating final metrics...")

     # Calculate final metrics
+    if len(exact_matches) == 0:
+        return "❌ No samples were successfully processed", pd.DataFrame(), None
+
     avg_exact_match = np.mean(exact_matches) * 100
     avg_f1_score = np.mean(f1_scores) * 100

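Note: the new guard matters because a run in which every sample raises lands at the averaging step with empty lists, and np.mean of an empty list is nan (with a RuntimeWarning), which would surface as a meaningless "nan%" score instead of a clear error:

    import numpy as np

    exact_matches, f1_scores = [], []  # every sample failed and was skipped
    print(np.mean(exact_matches) * 100)  # nan, plus a RuntimeWarning
    print(np.mean(f1_scores) * 100)      # nan as well
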
@@ -178,7 +182,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
     # Create detailed results DataFrame
     df = pd.DataFrame(predictions)

-    # Save results
+    # Save results to file
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     results_file = f"cuad_evaluation_results_{timestamp}.json"

@@ -192,8 +196,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
         "predictions": predictions
     }

-    with open(results_file, "w") as f:
-        json.dump(detailed_results, f, indent=2)
+    try:
+        with open(results_file, "w") as f:
+            json.dump(detailed_results, f, indent=2)
+        print(f"✓ Results saved to {results_file}")
+    except Exception as e:
+        print(f"⚠ Warning: Could not save results file: {e}")
+        results_file = None

     progress(1.0, desc="✅ Evaluation completed!")

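Note: the try/except lets the evaluation finish even when the file cannot be written (for example on a read-only filesystem), and results_file = None tells the UI layer below to hide the download button. The same logic as a standalone helper; the function name here is hypothetical:

    import json

    def save_results(detailed_results, results_file):
        # Mirror of the commit's save logic: return the path on success,
        # None when the file could not be written or serialized.
        try:
            with open(results_file, "w") as f:
                json.dump(detailed_results, f, indent=2)
            return results_file
        except Exception as e:
            print(f"⚠ Warning: Could not save results file: {e}")
            return None
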
@@ -256,7 +265,6 @@ def create_gradio_interface():

         with gr.Row():
             detailed_results = gr.Dataframe(
-                headers=["Sample_ID", "Question", "Predicted_Answer", "Ground_Truth", "Exact_Match", "F1_Score", "Confidence"],
                 label="Sample-by-Sample Results",
                 interactive=False,
                 wrap=True
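
Note: with the hardcoded headers= removed, the table takes its column names from whatever DataFrame run_evaluation returns, so the UI headers can no longer drift out of sync with the keys used to build predictions:

    import pandas as pd

    # The columns shown in the UI now come straight from the predictions dicts:
    predictions = [{"Sample_ID": 0, "Exact_Match": 1, "F1_Score": 87.5}]
    df = pd.DataFrame(predictions)
    print(list(df.columns))  # ['Sample_ID', 'Exact_Match', 'F1_Score']
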
@@ -269,14 +277,18 @@ def create_gradio_interface():
         )

         # Event handlers
+        def handle_evaluation(num_samples):
+            summary, df, file_path = run_evaluation(num_samples)
+            if file_path and os.path.exists(file_path):
+                return summary, df, gr.update(visible=True, value=file_path)
+            else:
+                return summary, df, gr.update(visible=False)
+
         evaluate_btn.click(
-            fn=run_evaluation,
+            fn=handle_evaluation,
             inputs=[num_samples],
             outputs=[results_summary, detailed_results, download_file],
             show_progress=True
-        ).then(
-            lambda: gr.update(visible=True),
-            outputs=[download_file]
         )

         # Footer
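
Note: the removed .then() chain made the download button visible unconditionally, even when no results file was written. Wrapping run_evaluation and returning gr.update(...) in the gr.File slot updates visibility and file value together in one handler. A minimal runnable version of the pattern, with illustrative names:

    import tempfile
    import gradio as gr

    def run(produce_file: bool):
        if produce_file:
            # Write a real temp file so gr.File has something to serve.
            tmp = tempfile.NamedTemporaryFile(suffix=".json", delete=False, mode="w")
            tmp.write("{}")
            tmp.close()
            return "✅ Done", gr.update(visible=True, value=tmp.name)
        return "❌ No file produced", gr.update(visible=False)

    with gr.Blocks() as demo:
        flag = gr.Checkbox(label="Produce a file?")
        btn = gr.Button("Run")
        status = gr.Markdown()
        download = gr.File(visible=False)
        btn.click(fn=run, inputs=[flag], outputs=[status, download])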
 