Upload 2 files
- requirements.txt +2 -0
- tool.py +228 -0
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+gradio==3.50.2
+numpy==1.26.2
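To try the Space locally (a hedged note, not part of the commit: it assumes a standard Python 3 environment with both files in the working directory), install the pinned dependencies with pip install -r requirements.txt and then run python tool.py; Gradio prints a local URL when the app starts.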
tool.py
ADDED
@@ -0,0 +1,228 @@
+import gradio as gr
+import numpy as np
+import re
+
+# Error categories and labels
+error_categories = {
+    "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
+    "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
+    "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
+    "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
+    "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"]
+}
+
+# Error severity ranges
+error_severity = {
+    "Message Cut-off": (1, 5),
+    "Prompt Continuation": (3, 5),
+    "Blank Response": (1, 1),
+    "Markup Language": (1, 6),
+    "Factually Inaccurate": (1, 4),
+    "Factually Incomplete": (3, 6),
+    "Inaccurately Applied": (3, 3),
+    "Misleading Impression": (3, 3),
+    "Spelling/Grammar/Usage": (2, 6),
+    "Persona": (1, 6),
+    "Formatting": (5, 6),
+    "Content": (1, 6),
+    "Hallucination": (1, 2),
+    "False Attribution": (2, 2),
+    "False Capabilities": (2, 3),
+    "Spatiotemporal Awareness": (2, 3),
+    "Full Non-Compliance": (1, 1),
+    "Partial Non-Compliance": (2, 3)
+}
+
+def calculate_ratings(selected_errors):
+    helpfulness_score = 7
+    honesty_score = 7
+
+    for error in selected_errors:
+        min_score, max_score = error_severity[error]
+        if error in ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]:
+            honesty_score = min(honesty_score, min_score)
+        else:
+            helpfulness_score = min(helpfulness_score, min_score)
+
+    return helpfulness_score, honesty_score
+
+def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
+    consistency_issues = []
+
+    # Check if ratings are 7 when no errors are selected
+    if not selected_errors:
+        if helpfulness < 7 or honesty < 7:
+            consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
+    else:
+        # Check if error rationale is provided when errors are selected
+        if not error_rationale:
+            consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")
+
+    # Check if rationale follows the "Error label - Quote - Reason" format
+    if selected_errors:
+        for error in selected_errors:
+            if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
+                consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")
+
+    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+
+    # Check if manual ratings are higher than calculated ratings
+    if helpfulness > calculated_helpfulness:
+        consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
+    if honesty > calculated_honesty:
+        consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")
+
+    # Check for severe honesty errors
+    honesty_errors = ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression", "Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"]
+    if any(error in honesty_errors for error in selected_errors) and honesty > 3:
+        consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")
+
+    # Check for multiple errors
+    if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
+        consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")
+
+    # Remind about Honesty priority
+    if honesty < helpfulness:
+        consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")
+
+    return consistency_issues
+
+def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
+    score1 = honesty1 * 2 + helpfulness1
+    score2 = honesty2 * 2 + helpfulness2
+
+    if score1 > score2:
+        return "Completion 1 is ranked higher"
+    elif score2 > score1:
+        return "Completion 2 is ranked higher"
+    else:
+        return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."
+
+def generate_error_rationale_template(selected_errors):
+    template = "Please provide a rationale for each selected error using the format below:\n\n"
+    for error in selected_errors:
+        template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
+    return template
+
+def process_completion(selected_errors, error_rationale, helpfulness, honesty):
+    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
+    consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)
+
+    return {
+        "calculated_helpfulness": calculated_helpfulness,
+        "calculated_honesty": calculated_honesty,
+        "manual_helpfulness": helpfulness,
+        "manual_honesty": honesty,
+        "consistency_issues": consistency_issues
+    }
+
+def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
+                        selected_errors2, error_rationale2, helpfulness2, honesty2):
+    result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
+    result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)
+
+    ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
+                              result2["manual_helpfulness"], result2["manual_honesty"])
+
+    return (
+        result1["calculated_helpfulness"],
+        result1["calculated_honesty"],
+        "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
+        result2["calculated_helpfulness"],
+        result2["calculated_honesty"],
+        "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
+        ranking
+    )
+
+with gr.Blocks() as app:
+    gr.Markdown("# xAI Completion Evaluation Tool")
+    gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")
+
+    with gr.Accordion("Tutorial and Directions", open=False):
+        gr.Markdown("""
+        This tool helps you evaluate and compare two AI completions according to xAI guidelines.
+
+        ### Step-by-Step Guide:
+
+        1. **Error Label Selection**:
+           - Select all applicable error labels for each completion.
+           - Refer to xAI guidelines for error type descriptions.
+
+        2. **Error Rationale**:
+           - Explain why you selected each error label.
+           - Be specific and reference the completion text.
+
+        3. **Manual Ratings**:
+           - Assign Helpfulness and Honesty ratings (1-7) for each completion.
+           - Consider error severity in your ratings.
+
+        4. **Evaluate Completions**:
+           - Click "Evaluate Completions" to process inputs.
+
+        5. **Review Results**:
+           - Check calculated ratings and consistency issues.
+           - Review the suggested ranking.
+
+        6. **Adjust if Necessary**:
+           - Address any consistency issues by revisiting your selections and ratings.
+           - Ensure alignment with xAI guidelines.
+
+        ### Key Reminders:
+        - Honesty is weighted more than Helpfulness in the final ranking.
+        - Always provide an error rationale when selecting error labels.
+        - If no errors are selected, both Helpfulness and Honesty should be 7.
+        """)
+
+    with gr.Tabs():
+        with gr.TabItem("Completion 1"):
+            with gr.Row():
+                with gr.Column():
+                    selected_errors1 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                with gr.Column():
+                    error_rationale1 = gr.Textbox(label="Error Rationale", lines=5)
+            with gr.Row():
+                helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                honesty1 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+        with gr.TabItem("Completion 2"):
+            with gr.Row():
+                with gr.Column():
+                    selected_errors2 = gr.CheckboxGroup(choices=[item for sublist in error_categories.values() for item in sublist], label="Select Errors")
+                with gr.Column():
+                    error_rationale2 = gr.Textbox(label="Error Rationale", lines=5)
+            with gr.Row():
+                helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, label="Helpfulness Rating")
+                honesty2 = gr.Slider(minimum=1, maximum=7, step=1, label="Honesty Rating")
+
+    submit_button = gr.Button("Evaluate Completions")
+
+    with gr.Accordion("Results", open=True):
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### Completion 1")
+                output_helpfulness = gr.Number(label="Calculated Helpfulness")
+                output_honesty = gr.Number(label="Calculated Honesty")
+                output_consistency = gr.Textbox(label="Consistency Check", lines=3)
+            with gr.Column():
+                gr.Markdown("### Completion 2")
+                output_helpfulness2 = gr.Number(label="Calculated Helpfulness")
+                output_honesty2 = gr.Number(label="Calculated Honesty")
+                output_consistency2 = gr.Textbox(label="Consistency Check", lines=3)
+        output_ranking = gr.Textbox(label="Ranking Suggestion")
+
+    def update_error_rationale(selected_errors):
+        return generate_error_rationale_template(selected_errors)
+
+    selected_errors1.change(update_error_rationale, inputs=[selected_errors1], outputs=[error_rationale1])
+    selected_errors2.change(update_error_rationale, inputs=[selected_errors2], outputs=[error_rationale2])
+
+    submit_button.click(
+        process_completions,
+        inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1,
+                selected_errors2, error_rationale2, helpfulness2, honesty2],
+        outputs=[output_helpfulness, output_honesty, output_consistency,
+                 output_helpfulness2, output_honesty2, output_consistency2,
+                 output_ranking]
+    )
+
+app.launch()
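As a quick sanity check of the rating and ranking logic above, here is a minimal sketch (not part of the commit). It assumes the trailing app.launch() call has been guarded behind if __name__ == "__main__": so that importing tool.py does not immediately start the server:

from tool import calculate_ratings, check_consistency, suggest_ranking

# "Hallucination" is an honesty-related error with severity range (1, 2):
# Honesty drops to the range minimum while Helpfulness stays at 7.
print(calculate_ratings(["Hallucination"]))  # (7, 1)

# "Formatting" has range (5, 6) and counts against Helpfulness instead.
print(calculate_ratings(["Formatting"]))     # (5, 7)

# A manual Honesty of 5 with "Hallucination" selected trips the
# calculated-rating check, the severe-honesty-error check (> 3),
# and the Honesty-priority reminder.
for issue in check_consistency(["Hallucination"], 'Hallucination - "quote" - reason', 7, 5):
    print(issue)

# The ranking double-weights Honesty: score = honesty * 2 + helpfulness,
# so 2*1 + 7 = 9 loses to 2*7 + 5 = 19.
print(suggest_ranking(7, 1, 5, 7))           # Completion 2 is ranked higher

The expected outputs in the comments follow directly from the error_severity table and the min() logic in calculate_ratings.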