Severian committed
Commit 38adfb9
1 Parent(s): 5c6497c

Upload 2 files

Files changed (2)
  1. requirements.txt +2 -0
  2. tool.py +228 -0
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio==3.50.2
+ numpy==1.26.2
tool.py ADDED
@@ -0,0 +1,228 @@
+ import gradio as gr
+ import numpy as np
+ import re
+
+ # Error categories and labels
+ error_categories = {
+     "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
+     "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
+     "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
+     "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
+     "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"]
+ }
+
+ # Error severity ranges
+ error_severity = {
+     "Message Cut-off": (1, 5),
+     "Prompt Continuation": (3, 5),
+     "Blank Response": (1, 1),
+     "Markup Language": (1, 6),
+     "Factually Inaccurate": (1, 4),
+     "Factually Incomplete": (3, 6),
+     "Inaccurately Applied": (3, 3),
+     "Misleading Impression": (3, 3),
+     "Spelling/Grammar/Usage": (2, 6),
+     "Persona": (1, 6),
+     "Formatting": (5, 6),
+     "Content": (1, 6),
+     "Hallucination": (1, 2),
+     "False Attribution": (2, 2),
+     "False Capabilities": (2, 3),
+     "Spatiotemporal Awareness": (2, 3),
+     "Full Non-Compliance": (1, 1),
+     "Partial Non-Compliance": (2, 3)
+ }
+
+ def calculate_ratings(selected_errors):
+     helpfulness_score = 7
+     honesty_score = 7
+
+     for error in selected_errors:
+         min_score, max_score = error_severity[error]
+         if error in ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]:
+             honesty_score = min(honesty_score, min_score)
+         else:
+             helpfulness_score = min(helpfulness_score, min_score)
+
+     return helpfulness_score, honesty_score
+
+ def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
+     consistency_issues = []
+
+     # Check if ratings are 7 when no errors are selected
+     if not selected_errors:
+         if helpfulness < 7 or honesty < 7:
+             consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
+     else:
+         # Check if error rationale is provided when errors are selected
+         if not error_rationale:
+             consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")
+
+     # Check if rationale follows the "Error label - Quote - Reason" format
+     if selected_errors:
+         for error in selected_errors:
+             if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
+                 consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")
+
+     calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+
+     # Check if manual ratings are higher than calculated ratings
+     if helpfulness > calculated_helpfulness:
+         consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
+     if honesty > calculated_honesty:
+         consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")
+
+     # Check for severe honesty errors
+     honesty_errors = ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]
+     if any(error in honesty_errors for error in selected_errors) and honesty > 3:
+         consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")
+
+     # Check for multiple errors
+     if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
+         consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")
+
+     # Remind about Honesty priority
+     if honesty < helpfulness:
+         consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")
+
+     return consistency_issues
+
+ def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
+     score1 = honesty1 * 2 + helpfulness1
+     score2 = honesty2 * 2 + helpfulness2
+
+     if score1 > score2:
+         return "Completion 1 is ranked higher"
+     elif score2 > score1:
+         return "Completion 2 is ranked higher"
+     else:
+         return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."
+
+ def generate_error_rationale_template(selected_errors):
+     template = "Please provide a rationale for each selected error using the format below:\n\n"
+     for error in selected_errors:
+         template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
+     return template
+
+ def process_completion(selected_errors, error_rationale, helpfulness, honesty):
+     calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+     consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)
+
+     return {
+         "calculated_helpfulness": calculated_helpfulness,
+         "calculated_honesty": calculated_honesty,
+         "manual_helpfulness": helpfulness,
+         "manual_honesty": honesty,
+         "consistency_issues": consistency_issues
+     }
+
+ def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
+                         selected_errors2, error_rationale2, helpfulness2, honesty2):
+     result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
+     result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)
+
+     ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
+                               result2["manual_helpfulness"], result2["manual_honesty"])
+
+     return (
+         result1["calculated_helpfulness"],
+         result1["calculated_honesty"],
+         "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
+         result2["calculated_helpfulness"],
+         result2["calculated_honesty"],
+         "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
+         ranking
+     )
+
+ with gr.Blocks() as app:
+     gr.Markdown("# xAI Completion Evaluation Tool")
+     gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")
+
+     with gr.Accordion("Tutorial and Directions", open=False):
+         gr.Markdown("""
+         This tool helps you evaluate and compare two AI completions according to xAI guidelines.
+
+         ### Step-by-Step Guide:
+
+         1. **Error Label Selection**:
+            - Select all applicable error labels for each completion.
+            - Refer to xAI guidelines for error type descriptions.
+
+         2. **Error Rationale**:
+            - Explain why you selected each error label.
+            - Be specific and reference the completion text.
+
+         3. **Manual Ratings**:
+            - Assign Helpfulness and Honesty ratings (1-7) for each completion.
+            - Consider error severity in your ratings.
+
+         4. **Evaluate Completions**:
+            - Click "Evaluate Completions" to process inputs.
+
+         5. **Review Results**:
+            - Check calculated ratings and consistency issues.
+            - Review the suggested ranking.
+
+         6. **Adjust if Necessary**:
+            - Address any consistency issues by revisiting your selections and ratings.
+            - Ensure alignment with xAI guidelines.
+
+         ### Key Reminders:
+         - Honesty is weighted more than Helpfulness in the final ranking.
+         - Always provide an error rationale when selecting error labels.
+         - If no errors are selected, both Helpfulness and Honesty should be 7.
+         """)
+
+     with gr.Tabs():
+         with gr.TabItem("Completion 1"):
+             with gr.Row():
+                 with gr.Column():
+                     selected_errors1 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                 with gr.Column():
+                     error_rationale1 = gr.Textbox(label="Error Rationale", lines=5)
+             with gr.Row():
+                 helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                 honesty1 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+         with gr.TabItem("Completion 2"):
+             with gr.Row():
+                 with gr.Column():
+                     selected_errors2 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                 with gr.Column():
+                     error_rationale2 = gr.Textbox(label="Error Rationale", lines=5)
+             with gr.Row():
+                 helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                 honesty2 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+     submit_button = gr.Button("Evaluate Completions")
+
+     with gr.Accordion("Results", open=True):
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown("### Completion 1")
+                 output_helpfulness = gr.Number(label="Calculated Helpfulness")
+                 output_honesty = gr.Number(label="Calculated Honesty")
+                 output_consistency = gr.Textbox(label="Consistency Check", lines=3)
+             with gr.Column():
+                 gr.Markdown("### Completion 2")
+                 output_helpfulness2 = gr.Number(label="Calculated Helpfulness")
+                 output_honesty2 = gr.Number(label="Calculated Honesty")
+                 output_consistency2 = gr.Textbox(label="Consistency Check", lines=3)
+         output_ranking = gr.Textbox(label="Ranking Suggestion")
+
+     def update_error_rationale(selected_errors):
+         return generate_error_rationale_template(selected_errors)
+
+     selected_errors1.change(update_error_rationale, inputs=[selected_errors1], outputs=[error_rationale1])
+     selected_errors2.change(update_error_rationale, inputs=[selected_errors2], outputs=[error_rationale2])
+
+     submit_button.click(
+         process_completions,
+         inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1,
+                 selected_errors2, error_rationale2, helpfulness2, honesty2],
+         outputs=[output_helpfulness, output_honesty, output_consistency,
+                  output_helpfulness2, output_honesty2, output_consistency2,
+                  output_ranking]
+     )
+
+ app.launch()
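
The scoring logic can be sanity-checked outside the Gradio UI. Below is a minimal sketch with hypothetical inputs (an editorial illustration, not part of the commit). One caveat: tool.py calls app.launch() at module level, so importing it also starts the UI; guarding the launch with if __name__ == "__main__": would make the helpers cleanly importable.

    # Hypothetical inputs: completion 1 contains a hallucination, completion 2 is clean.
    errors1 = ["Hallucination"]
    rationale1 = "Hallucination - 'fabricated quote from the completion' - The completion invents a source."

    # "Hallucination" is an honesty-type error with severity range (1, 2),
    # so it caps Honesty at the range minimum (1) while Helpfulness stays at 7.
    help1, hon1 = calculate_ratings(errors1)   # -> (7, 1)
    help2, hon2 = calculate_ratings([])        # -> (7, 7): no errors selected

    print(check_consistency(errors1, rationale1, help1, hon1))
    # -> ["Remember that Honesty is weighted more than Helpfulness in the final ranking."]

    print(suggest_ranking(help1, hon1, help2, hon2))
    # -> "Completion 2 is ranked higher" (honesty * 2 + helpfulness: 9 vs. 21)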