AvocadoMuffin committed on
Commit 0f03dd5 · verified · 1 Parent(s): c04ded8

Create app.py

Files changed (1)
  1. app.py +309 -0
app.py ADDED
@@ -0,0 +1,309 @@
import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime

def normalize_answer(s):
    """Normalize answer for evaluation"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_qa(prediction, ground_truth):
    """Calculate F1 score for QA"""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def exact_match_score(prediction, ground_truth):
    """Calculate exact match score"""
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def evaluate_model():
    # Authenticate with Hugging Face using the token
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
            print("✓ Authenticated with Hugging Face")
        except Exception as e:
            print(f"⚠ Warning: Could not authenticate with HF token: {e}")
    else:
        print("⚠ Warning: EVAL_TOKEN not found in environment variables")

    print("Loading model and tokenizer...")
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
        qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
        print("✓ Model loaded successfully")
        return qa_pipeline, hf_token
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        return None, None

def run_evaluation(num_samples, progress=gr.Progress()):
    """Run evaluation and return results for Gradio interface"""

    # Load model
    qa_pipeline, hf_token = evaluate_model()
    if qa_pipeline is None:
        return "❌ Failed to load model", "", ""

    progress(0.1, desc="Loading CUAD dataset...")

    # Load dataset
    try:
        dataset = load_dataset("cuad", trust_remote_code=True, token=hf_token)
        test_data = dataset["test"]
    except Exception:
        try:
            dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
            test_data = dataset["test"]
        except Exception as e2:
            return f"❌ Error loading dataset: {e2}", "", ""

    # Limit samples (cast to int: the slider value may arrive as a float)
    num_samples = int(min(num_samples, len(test_data)))
    test_subset = test_data.select(range(num_samples))

    progress(0.2, desc=f"Starting evaluation on {num_samples} samples...")

    # Initialize metrics
    exact_matches = []
    f1_scores = []
    predictions = []

    # Run evaluation
    for i, example in enumerate(test_subset):
        progress((0.2 + 0.7 * i / num_samples), desc=f"Processing sample {i+1}/{num_samples}")

        try:
            context = example["context"]
            question = example["question"]
            answers = example["answers"]

            # Get model prediction
            result = qa_pipeline(question=question, context=context)
            predicted_answer = result["answer"]

            # Get ground truth answers
            if answers["text"] and len(answers["text"]) > 0:
                ground_truth = answers["text"][0] if isinstance(answers["text"], list) else answers["text"]
            else:
                ground_truth = ""

            # Calculate metrics
            em = exact_match_score(predicted_answer, ground_truth)
            f1 = f1_score_qa(predicted_answer, ground_truth)

            exact_matches.append(em)
            f1_scores.append(f1)

            predictions.append({
                "Sample_ID": i+1,
                "Question": question[:100] + "..." if len(question) > 100 else question,
                "Predicted_Answer": predicted_answer,
                "Ground_Truth": ground_truth,
                "Exact_Match": em,
                "F1_Score": round(f1, 3),
                "Confidence": round(result["score"], 3)
            })

        except Exception:
            # Skip samples the pipeline cannot handle
            continue

    progress(0.9, desc="Calculating final metrics...")

    # Guard against the case where no sample was evaluated successfully
    if not predictions:
        return "❌ No samples were evaluated successfully", pd.DataFrame(), None

    # Calculate final metrics
    avg_exact_match = np.mean(exact_matches) * 100
    avg_f1_score = np.mean(f1_scores) * 100

    # Create results summary
    results_summary = f"""
# 📊 CUAD Model Evaluation Results

## 🎯 Overall Performance
- **Model**: AvocadoMuffin/roberta-cuad-qa-v2
- **Dataset**: CUAD (Contract Understanding Atticus Dataset)
- **Samples Evaluated**: {len(exact_matches)}
- **Evaluation Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

## 📈 Metrics
- **Exact Match Score**: {avg_exact_match:.2f}%
- **F1 Score**: {avg_f1_score:.2f}%

## 🔍 Performance Analysis
- **High Confidence Predictions**: {len([p for p in predictions if p['Confidence'] > 0.8])} ({len([p for p in predictions if p['Confidence'] > 0.8])/len(predictions)*100:.1f}%)
- **Perfect Matches**: {len([p for p in predictions if p['Exact_Match'] == 1])} ({len([p for p in predictions if p['Exact_Match'] == 1])/len(predictions)*100:.1f}%)
- **High F1 Scores (>0.8)**: {len([p for p in predictions if p['F1_Score'] > 0.8])} ({len([p for p in predictions if p['F1_Score'] > 0.8])/len(predictions)*100:.1f}%)
"""

    # Create detailed results DataFrame
    df = pd.DataFrame(predictions)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_evaluation_results_{timestamp}.json"

    detailed_results = {
        "model_name": "AvocadoMuffin/roberta-cuad-qa-v2",
        "dataset": "cuad",
        "num_samples": len(exact_matches),
        "exact_match_score": avg_exact_match,
        "f1_score": avg_f1_score,
        "evaluation_date": datetime.now().isoformat(),
        "predictions": predictions
    }

    with open(results_file, "w") as f:
        json.dump(detailed_results, f, indent=2)

    progress(1.0, desc="✅ Evaluation completed!")

    return results_summary, df, results_file

def create_gradio_interface():
    """Create Gradio interface for CUAD evaluation"""

    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
            <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
            <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>⚙️ Evaluation Settings</h3>")

                num_samples = gr.Slider(
                    minimum=10,
                    maximum=500,
                    value=100,
                    step=10,
                    label="Number of samples to evaluate",
                    info="Choose between 10-500 samples (more samples = more accurate but slower)"
                )

                evaluate_btn = gr.Button(
                    "🚀 Start Evaluation",
                    variant="primary",
                    size="lg"
                )

                gr.HTML("""
                <div style="margin-top: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
                    <h4>📋 What this evaluates:</h4>
                    <ul>
                        <li><strong>Exact Match</strong>: Percentage of perfect predictions</li>
                        <li><strong>F1 Score</strong>: Token-level overlap between prediction and ground truth</li>
                        <li><strong>Confidence</strong>: Model's confidence in its predictions</li>
                    </ul>
                </div>
                """)

            with gr.Column(scale=2):
                gr.HTML("<h3>📊 Results</h3>")

                results_summary = gr.Markdown(
                    value="Click '🚀 Start Evaluation' to begin...",
                    label="Evaluation Summary"
                )

        gr.HTML("<hr>")

        with gr.Row():
            gr.HTML("<h3>📋 Detailed Results</h3>")

        with gr.Row():
            detailed_results = gr.Dataframe(
                headers=["Sample_ID", "Question", "Predicted_Answer", "Ground_Truth", "Exact_Match", "F1_Score", "Confidence"],
                label="Sample-by-Sample Results",
                interactive=False,
                wrap=True
            )

        with gr.Row():
            download_file = gr.File(
                label="📥 Download Complete Results (JSON)",
                visible=False
            )

        # Event handlers
        evaluate_btn.click(
            fn=run_evaluation,
            inputs=[num_samples],
            outputs=[results_summary, detailed_results, download_file],
            show_progress=True
        ).then(
            lambda: gr.update(visible=True),
            outputs=[download_file]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 30px; padding: 20px; color: #666;">
            <p>🤖 Powered by Hugging Face Transformers & Gradio</p>
            <p>📚 CUAD Dataset by The Atticus Project</p>
        </div>
        """)

    return demo

if __name__ == "__main__":
    print("CUAD Model Evaluation with Gradio Interface")
    print("=" * 50)

    # Check if CUDA is available
    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("! Running on CPU")

    # Create and launch Gradio interface
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=True
    )
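
For a quick local sanity check of the metric helpers added in this commit, a minimal sketch like the following should work (illustrative only; it assumes the file is importable as app with its dependencies installed, and it exercises only the pure-Python scoring functions, not the model or the dataset):

from app import exact_match_score, f1_score_qa

# normalize_answer strips articles, punctuation, case and extra whitespace,
# so these two strings count as an exact match.
print(exact_match_score("The Buyer.", "buyer"))  # True

# Partial token overlap gives an F1 between 0 and 1:
# precision 2/4, recall 2/2 -> F1 = 0.667
print(round(f1_score_qa("governing law of Delaware", "Delaware law"), 3))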