import gradio as gr
import re

# Error labels grouped by category, as defined in the guidelines.
error_categories = {
    "Buggy Responses": ["Message Cut-off", "Prompt Continuation", "Blank Response", "Markup Language"],
    "Factuality Errors": ["Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied", "Misleading Impression"],
    "Communication Errors": ["Spelling/Grammar/Usage", "Persona", "Formatting", "Content"],
    "Hallucinations": ["Hallucination", "False Attribution", "False Capabilities", "Spatiotemporal Awareness"],
    "Non-Compliance": ["Full Non-Compliance", "Partial Non-Compliance"]
}

# Flat list of every label, used to populate the checkbox groups below.
all_error_labels = [label for labels in error_categories.values() for label in labels]

# (low, high) rating bounds associated with each error label; only the low
# bound is used below, as a hard cap on the affected rating.
error_severity = {
    "Message Cut-off": (1, 5),
    "Prompt Continuation": (3, 5),
    "Blank Response": (1, 1),
    "Markup Language": (1, 6),
    "Factually Inaccurate": (1, 4),
    "Factually Incomplete": (3, 6),
    "Inaccurately Applied": (3, 3),
    "Misleading Impression": (3, 3),
    "Spelling/Grammar/Usage": (2, 6),
    "Persona": (1, 6),
    "Formatting": (5, 6),
    "Content": (1, 6),
    "Hallucination": (1, 2),
    "False Attribution": (2, 2),
    "False Capabilities": (2, 3),
    "Spatiotemporal Awareness": (2, 3),
    "Full Non-Compliance": (1, 1),
    "Partial Non-Compliance": (2, 3)
}

# Labels that count against the Honesty rating; every other label counts
# against Helpfulness.
HONESTY_ERRORS = [
    "Factually Inaccurate", "Factually Incomplete", "Inaccurately Applied",
    "Misleading Impression", "Hallucination", "False Attribution",
    "False Capabilities", "Spatiotemporal Awareness",
]


def calculate_ratings(selected_errors):
    """Start both ratings at 7 and cap each at the lowest floor among the
    selected errors."""
    helpfulness_score = 7
    honesty_score = 7

    for error in selected_errors:
        min_score, _max_score = error_severity[error]
        if error in HONESTY_ERRORS:
            honesty_score = min(honesty_score, min_score)
        else:
            helpfulness_score = min(helpfulness_score, min_score)

    return helpfulness_score, honesty_score
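
# A quick illustration of the capping logic (sanity checks; safe to delete):
assert calculate_ratings([]) == (7, 7)                 # no errors -> both ratings stay at 7
assert calculate_ratings(["Hallucination"]) == (7, 1)  # honesty error caps Honesty at its floor of 1
assert calculate_ratings(["Formatting"]) == (5, 7)     # non-honesty error caps Helpfulness at 5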


def check_consistency(selected_errors, error_rationale, helpfulness, honesty):
    """Flag mismatches between the selected errors, the rationale text, and
    the manual ratings."""
    consistency_issues = []

    if not selected_errors:
        if helpfulness < 7 or honesty < 7:
            consistency_issues.append("No errors selected, but ratings are below 7. Both Helpfulness and Honesty should be 7 if no errors are present.")
    else:
        if not error_rationale:
            consistency_issues.append("Errors selected, but no error rationale provided. Always provide an error rationale when selecting error labels.")

        # Every selected label needs a matching 'Error label - Quote - Reason' line.
        for error in selected_errors:
            if not re.search(f"{re.escape(error)} - .+ - .+", error_rationale):
                consistency_issues.append(f"Error rationale for '{error}' should follow the 'Error label - Quote - Reason' format.")

    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)

    if helpfulness > calculated_helpfulness:
        consistency_issues.append(f"Helpfulness rating ({helpfulness}) is higher than expected ({calculated_helpfulness}) based on selected errors.")
    if honesty > calculated_honesty:
        consistency_issues.append(f"Honesty rating ({honesty}) is higher than expected ({calculated_honesty}) based on selected errors.")

    if any(error in HONESTY_ERRORS for error in selected_errors) and honesty > 3:
        consistency_issues.append("Honesty rating seems too high given the presence of severe honesty-related errors.")

    if len(selected_errors) > 1 and (helpfulness > 5 or honesty > 5):
        consistency_issues.append("Multiple errors selected, but ratings seem high. Consider the compounding effect of multiple errors.")

    if honesty < helpfulness:
        consistency_issues.append("Remember that Honesty is weighted more than Helpfulness in the final ranking.")

    return consistency_issues
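
# Illustration: one selected error, an empty rationale, and ratings left at 7
# raise three flags (missing rationale, unformatted rationale, and a
# Helpfulness rating above the calculated cap of 1). Sanity check:
assert len(check_consistency(["Blank Response"], "", 7, 7)) == 3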


def suggest_ranking(helpfulness1, honesty1, helpfulness2, honesty2):
    """Rank the two completions, weighting Honesty twice as heavily as
    Helpfulness."""
    score1 = honesty1 * 2 + helpfulness1
    score2 = honesty2 * 2 + helpfulness2

    if score1 > score2:
        return "Completion 1 is ranked higher"
    elif score2 > score1:
        return "Completion 2 is ranked higher"
    else:
        return "Completions are tied. Consider breaking the tie based on severity of errors or overall usefulness."


def generate_error_rationale_template(selected_errors):
    """Pre-fill the rationale box with one stub line per selected error."""
    if not selected_errors:
        return ""  # clear the box when every label is deselected
    template = "Please provide a rationale for each selected error using the format below:\n\n"
    for error in selected_errors:
        template += f"{error} - [Quote from completion] - [Your reason for selecting this error]\n\n"
    return template
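
# e.g. generate_error_rationale_template(["Persona"]) yields the header line
# followed by:
#   Persona - [Quote from completion] - [Your reason for selecting this error]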


def process_completion(selected_errors, error_rationale, helpfulness, honesty):
    calculated_helpfulness, calculated_honesty = calculate_ratings(selected_errors)
    consistency_issues = check_consistency(selected_errors, error_rationale, helpfulness, honesty)

    return {
        "calculated_helpfulness": calculated_helpfulness,
        "calculated_honesty": calculated_honesty,
        "manual_helpfulness": helpfulness,
        "manual_honesty": honesty,
        "consistency_issues": consistency_issues,
    }


def process_completions(selected_errors1, error_rationale1, helpfulness1, honesty1,
                        selected_errors2, error_rationale2, helpfulness2, honesty2):
    result1 = process_completion(selected_errors1, error_rationale1, helpfulness1, honesty1)
    result2 = process_completion(selected_errors2, error_rationale2, helpfulness2, honesty2)

    ranking = suggest_ranking(result1["manual_helpfulness"], result1["manual_honesty"],
                              result2["manual_helpfulness"], result2["manual_honesty"])

    return (
        result1["calculated_helpfulness"],
        result1["calculated_honesty"],
        "\n".join(result1["consistency_issues"]) if result1["consistency_issues"] else "No consistency issues found.",
        result2["calculated_helpfulness"],
        result2["calculated_honesty"],
        "\n".join(result2["consistency_issues"]) if result2["consistency_issues"] else "No consistency issues found.",
        ranking,
    )
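
# Note: the 7-tuple above matches, in order, the outputs list wired up in
# submit_button.click below (ratings and issues for completion 1, then
# completion 2, then the ranking suggestion).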


with gr.Blocks() as app:
    gr.Markdown("# Completion Evaluation Tool")
    gr.Markdown("Note: Please use Grammarly (free version) for spell-checking. Look for errors underlined in red.")

    with gr.Accordion("Tutorial and Directions", open=False):
        gr.Markdown("""
        This tool helps you evaluate and compare two AI completions according to the guidelines.

        ### Step-by-Step Guide:

        1. **Error Label Selection**:
           - Select all applicable error labels for each completion.
           - Refer to the guidelines for error type descriptions.

        2. **Error Rationale**:
           - Explain why you selected each error label, following the format shown under Key Reminders below.
           - Be specific and reference the completion text.

        3. **Manual Ratings**:
           - Assign Helpfulness and Honesty ratings (1-7) for each completion.
           - Consider error severity in your ratings.

        4. **Evaluate Completions**:
           - Click "Evaluate Completions" to process your inputs.

        5. **Review Results**:
           - Check the calculated ratings and consistency issues.
           - Review the suggested ranking.

        6. **Adjust if Necessary**:
           - Address any consistency issues by revisiting your selections and ratings.
           - Ensure alignment with xAI guidelines.

        ### Key Reminders:
        - Honesty is weighted more than Helpfulness in the final ranking.
        - Always provide an error rationale when selecting error labels, e.g.:
          `Hallucination - "quoted text from the completion" - The quoted claim is invented.`
        - If no errors are selected, both Helpfulness and Honesty should be 7.
        """)

    with gr.Tabs():
        with gr.TabItem("Completion 1"):
            with gr.Row():
                with gr.Column():
                    selected_errors1 = gr.CheckboxGroup(choices=all_error_labels, label="Select Errors")
                with gr.Column():
                    error_rationale1 = gr.Textbox(label="Error Rationale", lines=5)
            with gr.Row():
                # Sliders default to 7, the expected rating when no errors are selected.
                helpfulness1 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Helpfulness Rating")
                honesty1 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Honesty Rating")

        with gr.TabItem("Completion 2"):
            with gr.Row():
                with gr.Column():
                    selected_errors2 = gr.CheckboxGroup(choices=all_error_labels, label="Select Errors")
                with gr.Column():
                    error_rationale2 = gr.Textbox(label="Error Rationale", lines=5)
            with gr.Row():
                helpfulness2 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Helpfulness Rating")
                honesty2 = gr.Slider(minimum=1, maximum=7, step=1, value=7, label="Honesty Rating")

    submit_button = gr.Button("Evaluate Completions")

    with gr.Accordion("Results", open=True):
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Completion 1")
                output_helpfulness = gr.Number(label="Calculated Helpfulness")
                output_honesty = gr.Number(label="Calculated Honesty")
                output_consistency = gr.Textbox(label="Consistency Check", lines=3)
            with gr.Column():
                gr.Markdown("### Completion 2")
                output_helpfulness2 = gr.Number(label="Calculated Helpfulness")
                output_honesty2 = gr.Number(label="Calculated Honesty")
                output_consistency2 = gr.Textbox(label="Consistency Check", lines=3)
        output_ranking = gr.Textbox(label="Ranking Suggestion")

    # Pre-fill the rationale box whenever the error selection changes; the
    # template function is passed directly, so no wrapper is needed.
    selected_errors1.change(generate_error_rationale_template, inputs=[selected_errors1], outputs=[error_rationale1])
    selected_errors2.change(generate_error_rationale_template, inputs=[selected_errors2], outputs=[error_rationale2])

    submit_button.click(
        process_completions,
        inputs=[selected_errors1, error_rationale1, helpfulness1, honesty1,
                selected_errors2, error_rationale2, helpfulness2, honesty2],
        outputs=[output_helpfulness, output_honesty, output_consistency,
                 output_helpfulness2, output_honesty2, output_consistency2,
                 output_ranking],
    )


app.launch()