Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -84,7 +84,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
84 |
# Load model
|
85 |
qa_pipeline, hf_token = evaluate_model()
|
86 |
if qa_pipeline is None:
|
87 |
-
return "❌ Failed to load model",
|
88 |
|
89 |
progress(0.1, desc="Loading CUAD dataset...")
|
90 |
|
@@ -97,7 +97,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
97 |
dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
|
98 |
test_data = dataset["test"]
|
99 |
except Exception as e2:
|
100 |
-
return f"❌ Error loading dataset: {e2}",
|
101 |
|
102 |
# Limit samples
|
103 |
num_samples = min(num_samples, len(test_data))
|
@@ -147,11 +147,15 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
147 |
})
|
148 |
|
149 |
except Exception as e:
|
|
|
150 |
continue
|
151 |
|
152 |
progress(0.9, desc="Calculating final metrics...")
|
153 |
|
154 |
# Calculate final metrics
|
|
|
|
|
|
|
155 |
avg_exact_match = np.mean(exact_matches) * 100
|
156 |
avg_f1_score = np.mean(f1_scores) * 100
|
157 |
|
@@ -178,7 +182,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
178 |
# Create detailed results DataFrame
|
179 |
df = pd.DataFrame(predictions)
|
180 |
|
181 |
-
# Save results
|
182 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
183 |
results_file = f"cuad_evaluation_results_{timestamp}.json"
|
184 |
|
@@ -192,8 +196,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
192 |
"predictions": predictions
|
193 |
}
|
194 |
|
195 |
-
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
progress(1.0, desc="✅ Evaluation completed!")
|
199 |
|
@@ -256,7 +265,6 @@ def create_gradio_interface():
|
|
256 |
|
257 |
with gr.Row():
|
258 |
detailed_results = gr.Dataframe(
|
259 |
-
headers=["Sample_ID", "Question", "Predicted_Answer", "Ground_Truth", "Exact_Match", "F1_Score", "Confidence"],
|
260 |
label="Sample-by-Sample Results",
|
261 |
interactive=False,
|
262 |
wrap=True
|
@@ -269,14 +277,18 @@ def create_gradio_interface():
|
|
269 |
)
|
270 |
|
271 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
evaluate_btn.click(
|
273 |
-
fn=run_evaluation,
|
274 |
inputs=[num_samples],
|
275 |
outputs=[results_summary, detailed_results, download_file],
|
276 |
show_progress=True
|
277 |
-
).then(
|
278 |
-
lambda: gr.update(visible=True),
|
279 |
-
outputs=[download_file]
|
280 |
)
|
281 |
|
282 |
# Footer
|
|
|
84 |
# Load model
|
85 |
qa_pipeline, hf_token = evaluate_model()
|
86 |
if qa_pipeline is None:
|
87 |
+
return "❌ Failed to load model", pd.DataFrame(), None
|
88 |
|
89 |
progress(0.1, desc="Loading CUAD dataset...")
|
90 |
|
|
|
97 |
dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
|
98 |
test_data = dataset["test"]
|
99 |
except Exception as e2:
|
100 |
+
return f"❌ Error loading dataset: {e2}", pd.DataFrame(), None
|
101 |
|
102 |
# Limit samples
|
103 |
num_samples = min(num_samples, len(test_data))
|
|
|
147 |
})
|
148 |
|
149 |
except Exception as e:
|
150 |
+
print(f"Error processing sample {i}: {e}")
|
151 |
continue
|
152 |
|
153 |
progress(0.9, desc="Calculating final metrics...")
|
154 |
|
155 |
# Calculate final metrics
|
156 |
+
if len(exact_matches) == 0:
|
157 |
+
return "❌ No samples were successfully processed", pd.DataFrame(), None
|
158 |
+
|
159 |
avg_exact_match = np.mean(exact_matches) * 100
|
160 |
avg_f1_score = np.mean(f1_scores) * 100
|
161 |
|
|
|
182 |
# Create detailed results DataFrame
|
183 |
df = pd.DataFrame(predictions)
|
184 |
|
185 |
+
# Save results to file
|
186 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
187 |
results_file = f"cuad_evaluation_results_{timestamp}.json"
|
188 |
|
|
|
196 |
"predictions": predictions
|
197 |
}
|
198 |
|
199 |
+
try:
|
200 |
+
with open(results_file, "w") as f:
|
201 |
+
json.dump(detailed_results, f, indent=2)
|
202 |
+
print(f"β Results saved to {results_file}")
|
203 |
+
except Exception as e:
|
204 |
+
print(f"β Warning: Could not save results file: {e}")
|
205 |
+
results_file = None
|
206 |
|
207 |
progress(1.0, desc="✅ Evaluation completed!")
|
208 |
|
|
|
265 |
|
266 |
with gr.Row():
|
267 |
detailed_results = gr.Dataframe(
|
|
|
268 |
label="Sample-by-Sample Results",
|
269 |
interactive=False,
|
270 |
wrap=True
|
|
|
277 |
)
|
278 |
|
279 |
# Event handlers
|
280 |
+
def handle_evaluation(num_samples):
|
281 |
+
summary, df, file_path = run_evaluation(num_samples)
|
282 |
+
if file_path and os.path.exists(file_path):
|
283 |
+
return summary, df, gr.update(visible=True, value=file_path)
|
284 |
+
else:
|
285 |
+
return summary, df, gr.update(visible=False)
|
286 |
+
|
287 |
evaluate_btn.click(
|
288 |
+
fn=handle_evaluation,
|
289 |
inputs=[num_samples],
|
290 |
outputs=[results_summary, detailed_results, download_file],
|
291 |
show_progress=True
|
|
|
|
|
|
|
292 |
)
|
293 |
|
294 |
# Footer
|